# This is the code to run the statistical simulation and compare its results
# with CTGAN-simulated datasets, covering regression, binary classification and
# right-censored survival analysis on health and medical datasets. One dataset
# is used per analysis type; this code produces all figures and tables for the
# paper. (This header was plain text, not a comment, which is a parse error.)
# .libPaths("/dskh/nobackup/yunwei/aigan_simulation/aigan_library")
# setwd("/dskh/nobackup/yunwei/aigan_simulation")
# library(inline)
# openblas.set.num.threads <- cfunction( signature(ipt="integer"),
# body = "openblas_set_num_threads(*ipt);",
# otherdefs = c ("extern void openblas_set_num_threads(int);"),
# libargs = c ("/usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3"),
# language = "C",
# convention = ".C"
# )
# openblas.set.num.threads(1)
#install.packages("Matrix",version="1.6.4")
# #package_version("Matrix")
library(Matrix)
# install.packages("htmltools",version="0.5.7")
# library(htmltools)
# install.packages("vctrs",version="0.6.4")
# library(vctrs)
# install.packages("scales",version="1.3.0")
# library(scales)
library(survival)
#install.packages("iterators")
library(glmnet)
library(ncvreg)
# install.packages("fansi")
# install.packages("utf8")
# install.packages("pkgconfig")
library(ggplot2)
library(hdnom)
library(Hmisc)
library(survAUC)
library(divo)
library(transport)
library(MASS)
library(tidyverse)
library(purrr)
library(glmnet)
library(modgo)
library(caret)
library(corrplot)
library(reshape2)#testing: need to delete the causing issue if conditions with the "apply" argument
# Generate `nrep` synthetic datasets that mimic `data` using a rank-based
# inverse-normal (Gaussian-copula style) approach:
#   1. rank-transform every column to normality,
#   2. estimate a mixed-type correlation matrix (see Sigma_transformation),
#   3. draw multivariate-normal samples from that matrix,
#   4. back-transform each column to its original marginal distribution.
#
# Arguments (as used in this script):
#   data            data frame of numeric columns only, no NA's
#   ties_method     tie handling passed to the rank transform (e.g. "max")
#   variables       columns of `data` to simulate
#   bin_variables   names of 0/1 columns
#   categ_variables names of categorical columns (columns with more than 7
#                   unique values are demoted to continuous, with a message)
#   n_samples       rows per simulated dataset
#   sigma           optional user-supplied correlation matrix (skips step 1-2)
#   nrep            number of simulated datasets to produce
#   noise_mu, pertr_vec, change_cov/change_amount, thresh_var/thresh_force,
#   var_prop, var_infl/infl_cov_stable: optional mean-noise, perturbation,
#   covariance-tweak, thresholding, proportion and variance-inflation controls
#   (all left at defaults in this script).
#
# Returns a named list: SimulatedData (list of data frames), OriginalData,
# Correlations (per-rep + mean), Binary_variables, Categorical_variables,
# Covariance_Matrix, Seed, Samples_Produced, Sim_Dataset_Number.
#
# NOTE(review): depends on rbi_normal_transform()/rbi_normal_transform_inv()
# from the modgo package (library(modgo) above must stay loaded) and on
# Sigma_transformation() defined later in this file.
modgo <- function(data,ties_method= "max", variables= colnames(data),
bin_variables= c(),categ_variables= c(),
n_samples= nrow(data),sigma= c(),nrep= 100,
noise_mu= FALSE, pertr_vec= c(),
change_cov= c(),change_amount= 0,seed= 1,
thresh_var= c(), thresh_force = FALSE,
var_prop= c(),var_infl=c(),infl_cov_stable=FALSE) {
if (!is.na(seed)){
# Setting Seed
set.seed(seed)
}
#Check input
if (!is.data.frame(data)){
stop("Data is not a data frame")
}
if (any(is.na(data))){
stop("Data set contains NA's")
}
if (!all(variables %in% colnames(data))){
stop("Variables are not column names of data ")
}
# NOTE(review): the '!=' comparisons below are vectorized; inside '||' this
# errors in R >= 4.3 whenever more than one name differs. Consider
# !identical(colnames(sigma), variables) — confirm before changing.
if (length(sigma)>0 & (colnames(sigma)!= variables ||
rownames(sigma)!= variables)){
stop("Sigma column and row names and not exactly equal to variable names")
}
# Include only selected variables in the original dataset
data <- data[,variables]
if(!all(apply(data, 2, function(x) is.numeric(x)))){
stop("Data should only contain numerical values")
}
if (!all(bin_variables %in% variables)){
stop("Binary variables are not part of the provided variables")
}
for (bin in bin_variables){
if(length(which(data[,bin] %in% c(0,1)))<length(data[,bin])) {
stop(paste0("Binary variable",bin,"is neither 0 nor 1"))
}
}
if (!all(categ_variables %in% variables)){
stop("Categorical variables are not part of the provided variables")
}
if (!all(thresh_var[,1] %in% variables)){
stop("Threshold variables are not part of the provided variables")
}
# Find the continuous variables
continuous_var <- setdiff(variables,c(bin_variables,categ_variables))
if (length(pertr_vec) > 0 && !all(names(pertr_vec) %in% continuous_var) ){
stop("Pertrubation variables are not part of the provided
continuous variables")
}
if (length(var_infl) > 0 && !all(names(var_infl) %in% continuous_var) ){
stop("Variance inflation variables are not part of the provided
continuous variables")
}
if (!is.logical(infl_cov_stable)){
stop("Inflation variance stable should be TRUE/FALSE")
}
if(any(names(pertr_vec) %in% names(var_infl))){
stop("Perturb vector cannot have common variables with variance inflation vector")
}
if (change_amount!=0 && length(change_cov)==0){
stop("You need to provide a pair of variables(change_cov)
to change their covariance values")
}
if (change_amount==0 && length(change_cov)==2){
stop("You need to provide an amount of change (change_amount)")
}
if (!(length(change_cov) %in% c(0,2))) {
stop("You need to provide two variables")
}
# Noise in multivariate distributions centers
if (noise_mu == TRUE){
ns <- rnorm(ncol(data),mean = 0,sd=1)
} else {
ns <- matrix(0,nrow = 1,ncol = ncol(data))
}
if (length(var_prop)>0 & !all(names(var_prop) %in% bin_variables)){
stop("Name of var_prop should be a binary variable")
}
if (length(var_prop)>0 & (var_prop>1 || var_prop <0)){
stop("Variable proportion should be between 0 and 1")
}
if (length(var_prop)>1 ){
stop("You cannot set proportion for more than 1 variable")
}
if (length(var_prop) ==1 & length(thresh_var[,1]) >0 ){
stop("You cannot set a variable proportion and
a variable threshold at the same run ")
}
OriginalData <- data
# Normal run in case user does not provide a sigma
if(length(sigma)==0){
# Rank transformation of each column of the data set
df_rbi <- data
for(j in 1:ncol(df_rbi)) {
df_rbi[[j]] <- rbi_normal_transform(data[[j]],ties_method = ties_method)
}
Sigma <- cov(df_rbi)
diag(Sigma) <- 1
# Keep only categorical variables with <= 7 unique values; the rest are
# treated as continuous (message only, no error).
categ_var <- c()
for (ct_var in categ_variables){
if(length(unique(data[,ct_var])) > 7){
print(
paste0(ct_var,
" will be treated as continuous due to having more than 7 unique values."))
}else {
categ_var <- c(categ_var,ct_var)
}
}
# Temporarily recode each categorical variable to consecutive integers
# 1..k (required by the polychoric/polyserial estimators); the original
# values are stashed in subst_list and restored after the call.
subst_list <- vector(mode="list",length = length(categ_var))
names(subst_list) <- categ_var
for (cate in categ_var){
unq_values <- sort(unique(data[,cate]))
values = unq_values
subst_list[[cate]] <- values
fake_data <- vector(length = length(data[,cate]))
for (i in c(1:length(unq_values))){
fake_data[which(data[,cate]==values[i])] <- i
}
data[,cate] <- fake_data
}
oldw <- getOption("warn")
options(warn = -1)
Sigma <- suppressMessages(Sigma_transformation(
data=data,data_z = df_rbi,Sigma,
variables = variables,
bin_variables = bin_variables,
categ_variables=categ_var))
options(warn = oldw)
# Restore the original categorical codes from subst_list.
for (cate in categ_var){
values <- subst_list[[cate]]
fake_data <- vector(length = length(data[,cate]))
for (i in c(1:length(values))) {
fake_data[which(data[,cate]==i)] <- values[i]
}
data[,cate] <- fake_data
}
}else {Sigma <- sigma}
# Change specified pairs of covariance matrix by specified amount
if (change_amount!=0 && length(change_cov)==2) {
Sigma[change_cov[1],change_cov[2]] <-
Sigma[change_cov[2],change_cov[1]] <-
Sigma[change_cov[2],change_cov[1]] + change_amount
}
# Check Sigma if it is positive definite
if(!all(eigen(Sigma)$value > 0)){
print("Covariance matrix is not positive definite.")
print("It will be replaced with nearest positive definite matrix")
Sigma <- Matrix::nearPD(Sigma,corr = TRUE)$mat
}
# Check Sigma if it contains NA's
if (any(is.na(Sigma))){
stop("Correlation of data contains NA's")
}
## Inflation analysis - stable covariance matrix
if ( length(var_infl) >0 && infl_cov_stable==TRUE){
#Inverse transformation of each variable
for(j in 1:ncol(data)) {
if(colnames(data)[[j]] %in% names(var_infl)){
p <- var_infl[which(
names(var_infl)==colnames(data)[[j]])]
data[[j]] <- data[[j]] +
rnorm(length(data[[j]]),mean = 0,sd=sd(sqrt(p)*data[[j]]))
}
}
}
## Thresholds
# A pilot draw estimates what fraction of samples survives the thresholds;
# thresh_multi oversamples later draws to compensate.
if (length(thresh_var[,1]) >0){
mt_sim <- MASS::mvrnorm(n = n_samples,
mu = rep(0, ncol(data)) +ns,
Sigma = Sigma)
df_sim <- data.frame(mt_sim)
names(df_sim) <- names(data)
#Inverse transformation of each variable
for(j in 1:ncol(df_sim)) {
df_sim[[j]] <- rbi_normal_transform_inv(df_sim[[j]], data[[j]])
}
for (i in c(1:length(thresh_var[,1]))){
if(is.na(thresh_var[i,2]))
{low_thresh <-
min(df_sim[thresh_var[i,1]])-1}else{low_thresh <-thresh_var[i,2]}
if(is.na(thresh_var[i,3]))
{up_thresh <-
max(df_sim[thresh_var[i,1]])+1}else{up_thresh <-thresh_var[i,3]}
df_sim <- df_sim[which(df_sim[thresh_var[i,1]]< up_thresh &
df_sim[thresh_var[i,1]]> low_thresh),]
}
if(length(df_sim[,1])/n_samples <0.1 & thresh_force == FALSE ){
stop("The propotion of simulated samples passing
the threshold is less than 10% ")
}else if(length(df_sim[,1])==0){
stop("The propotion of simulated samples passing
the threshold is 0% ")
}else{thresh_multi <- 1.1*n_samples/length(df_sim[,1])}
}else{thresh_multi=1}
# Binary variables proportions
if (length(var_prop) ==1 ){
mt_sim <- MASS::mvrnorm(n = n_samples,
mu = rep(0, ncol(data)) +ns,
Sigma = Sigma)
df_sim <- data.frame(mt_sim)
names(df_sim) <- names(data)
#Inverse transformation of each variable
for(j in 1:ncol(df_sim)) {
df_sim[[j]] <- rbi_normal_transform_inv(df_sim[[j]], data[[j]])
}
counts_1 <- length(which(df_sim[,names(var_prop)]==1))
counts_0 <- length(which(df_sim[,names(var_prop)]==0))
req_1 <- var_prop * n_samples
req_0 <- (1-var_prop) * n_samples
if (req_1 >= counts_1){
thresh_multi <- (1.1*req_1)/counts_1
}else if (req_1 < counts_1){
thresh_multi <- (1.1*req_0)/counts_0
}
}
# Starting loop for many repetions
Correlations <- vector(mode = "list", length = nrep+1)
mean_corr <- matrix(0,nrow =ncol(data),ncol = ncol(data))
SimulatedData <- vector(mode = "list", length = nrep)
i <- 1
counter <- 0
# Loop for creating new datasets and obtaining their mean correlations
while (i < nrep+1){
mt_sim <- MASS::mvrnorm(n = ceiling(n_samples*thresh_multi),
mu = rep(0, ncol(data)) + ns,
Sigma = Sigma)
df_sim <- data.frame(mt_sim)
names(df_sim) <- names(data)
#Inverse transformation of each variable
for(j in 1:ncol(df_sim)) {
df_sim[[j]] <- rbi_normal_transform_inv(df_sim[[j]], data[[j]])
if(colnames(data)[[j]] %in% names(pertr_vec)){
p <- pertr_vec[which(
names(pertr_vec)==colnames(data)[[j]])]
df_sim[[j]] <- (df_sim[[j]]*sqrt(1-p)) +
rnorm(length(df_sim[[j]]),mean = 0,sd=sd(df_sim[[j]])*sqrt(p))
}
if(colnames(data)[[j]] %in% names(var_infl) && infl_cov_stable == FALSE){
p <- var_infl[which(
names(var_infl)==colnames(data)[[j]])]
df_sim[[j]] <- df_sim[[j]] +
rnorm(length(df_sim[[j]]),mean = 0,sd=sd( sqrt(p)*df_sim[[j]]))
}
}
#Threshold process
if(length(thresh_var[,1])>0){
for (j in c(1:length(thresh_var[,1]))){
if(is.na(thresh_var[j,2]))
{low_thresh <-
min(df_sim[thresh_var[j,1]])-1}else{low_thresh <-thresh_var[j,2]}
if(is.na(thresh_var[j,3]))
{up_thresh <-
max(df_sim[thresh_var[j,1]])+1}else{up_thresh <-thresh_var[j,3]}
df_sim <- df_sim[which(df_sim[thresh_var[j,1]]< up_thresh &
df_sim[thresh_var[j,1]]> low_thresh),]
}
}
#Proportion process
if(length(var_prop)==1){
rounded_length <- ceiling(n_samples*var_prop)
df_sim_1 <- df_sim[which(df_sim[,names(var_prop)] == 1),]
df_sim_0 <- df_sim[which(df_sim[,names(var_prop)] == 0),]
df_sim <- rbind(df_sim_1[c(1:rounded_length),],
df_sim_0[c(1:(n_samples-rounded_length)),])
}
# Rejection step: if this draw is too small or degenerate, undo the loop
# increment (i is decremented here, then re-incremented below) and retry;
# `counter` caps total retries at 100*nrep.
if(length(df_sim[,1]) < n_samples || is.null(df_sim) || any(is.na(suppressWarnings(cor(df_sim)))) ){
i <- i-1
counter <-counter +1
} else {
df_sim <- df_sim[c(1:n_samples),]
#Correlation calculation
Correlations[[i]] <- cor(df_sim)
SimulatedData[[i]] <- df_sim
#Mean correlation calculation
mean_corr <- mean_corr + (Correlations[[i]]/nrep)
}
if(counter > 100*nrep){
stop("Could not create simulated datasets with this specific thresholds")
}
i <- i + 1
}
Correlations[[nrep+1]] <- mean_corr
names(Correlations) <- c(paste0("rep",seq(1:nrep)),"Mean")
results <-list(SimulatedData,OriginalData,Correlations,
bin_variables,categ_variables,Sigma,seed,n_samples,nrep)
names(results) <-c("SimulatedData","OriginalData","Correlations",
"Binary_variables","Categorical_variables",
"Covariance_Matrix","Seed","Samples_Produced",
"Sim_Dataset_Number")
return(results)
}
# Replace the mixed-type entries of the correlation matrix `Sigma` with
# latent-correlation estimates from the psych package:
#   - biserial:    binary vs continuous (rank-transformed) variables
#   - polychoric:  categorical vs categorical / categorical vs binary
#   - tetrachoric: binary vs binary
#   - polyserial:  categorical vs continuous
# Arguments:
#   data            original data (categoricals recoded to 1..k by the caller)
#   data_z          rank-based inverse-normal transformed data
#   Sigma           Pearson correlation matrix of data_z (starting point)
#   variables       all variable names
#   bin_variables   names of binary (0/1) variables
#   categ_variables names of categorical variables
# Returns `Sigma` with the binary/categorical entries overwritten (kept
# symmetric). capture.output()/invisible() silence psych's console chatter.
Sigma_transformation <- function(data, data_z, Sigma, variables,
                                 bin_variables = c(), categ_variables = c()) {
  cont_variables <- setdiff(variables, c(bin_variables, categ_variables))
  # Binary correlations
  for (bin_var in bin_variables) {
    for (cont_var in cont_variables) {
      invisible(capture.output(aux <- psych::biserial(x = data_z[[cont_var]],
                                                      y = data[[bin_var]])))
      Sigma[cont_var, bin_var] <- Sigma[bin_var, cont_var] <- aux
    }
    for (cat_variables in categ_variables) {
      invisible(capture.output(aux <- psych::polychoric(
        x = data[, c(cat_variables, bin_var)], global = FALSE)$rho[1, 2]))
      Sigma[cat_variables, bin_var] <- Sigma[bin_var, cat_variables] <- aux
    }
    for (second_bin_var in bin_variables) {
      if (second_bin_var != bin_var) {
        invisible(capture.output(aux <- psych::tetrachoric(
          x = data[, c(bin_var, second_bin_var)])$rho[1, 2]))
        Sigma[second_bin_var, bin_var] <- Sigma[bin_var, second_bin_var] <- aux
      }
    }
  }
  # Categorical correlations
  for (cat_var in categ_variables) {
    for (cont_var in cont_variables) {
      invisible(capture.output(aux <- psych::polyserial(
        x = data_z[[cont_var]], y = as.matrix(data[[cat_var]]))))
      Sigma[cont_var, cat_var] <- Sigma[cat_var, cont_var] <- aux
    }
    for (second_cat_var in categ_variables) {
      if (second_cat_var != cat_var) {
        invisible(capture.output(aux <- psych::polychoric(
          x = data[, c(cat_var, second_cat_var)])$rho[1, 2]))
        Sigma[second_cat_var, cat_var] <- Sigma[cat_var, second_cat_var] <- aux
      }
    }
  }
  return(Sigma)
}

# BUG FIX: this statement was fused onto the closing brace above
# ("}vitd1=read.csv(...)"), which is a parse error in R; it now stands on
# its own line. Loads the regression (vitamin D) dataset.
vitd1 <- read.csv("vitd_3.csv")
# ---- Data preparation and statistical (modgo) simulation -------------------
# Drop identifier/unused columns, remove incomplete rows, and recode the
# string-valued categoricals to the numeric codes modgo requires.
vitd1=vitd1[,!colnames(vitd1)%in%c("SEQN","wave","RIAGENDR","Cholesterol")]
vitd1=na.omit(vitd1)
table(vitd1$Gender)
table(vitd1$Race)
table(vitd1$Education)
# Males -> 0, Females -> 1; Race/Education to ordinal codes (999 = unknown).
vitd1$Gender=ifelse(vitd1$Gender=="Males",0,1)
vitd1$Race=ifelse(vitd1$Race=="White",1,ifelse(vitd1$Race=="Black",2,ifelse(vitd1$Race=="Hispanic",3,4)))
vitd1$Education=ifelse(vitd1$Education=="no_high",1,ifelse(vitd1$Education=="high_grad",2,ifelse(vitd1$Education=="college_entry",3,ifelse(vitd1$Education=="college_grad",4,999))))
# zip_path <- "10-1055-a-2048-7692-s22010082/modgo.tar.gz"
# # Install the package from the zip file
# install.packages(zip_path, repos = NULL, type = "source")
# Simulate covariates only; the response vitD is generated afterwards from a
# fitted regression model.
modgo_dt=vitd1[,!colnames(vitd1)=="vitD"]
# library("modgo")
# NOTE(review): column positions 13 and 8:12 are assumed to be the binary and
# categorical variables of modgo_dt — verify if the input file changes.
binary_variables <- colnames(modgo_dt)[13]
categorical_variables <- colnames(modgo_dt)[8:12]
# test <- modgo(data = vitd1,bin_variables = binary_variables,categ_variables = categorical_variables,nrep = 1,ties_method= "max", variables= colnames(vitd1),
# n_samples= nrow(data),sigma= c(),
# noise_mu= FALSE, pertr_vec= c(),
# change_cov= c(),change_amount= 0,seed= 1,
# thresh_var= c(), thresh_force = FALSE,
# var_prop= c(),var_infl=c(),infl_cov_stable=FALSE)
# One simulated dataset of 1000 rows, seeded for reproducibility.
test <- modgo(data = modgo_dt,bin_variables = binary_variables,categ_variables = categorical_variables,nrep = 1,ties_method= "max", variables= colnames(modgo_dt),
n_samples= 1000,sigma= c(),
noise_mu= FALSE, pertr_vec= c(),
change_cov= c(),change_amount= 0,seed= 1,
thresh_var= c(), thresh_force = FALSE,
var_prop= c(),var_infl=c(),infl_cov_stable=FALSE)
# First (and only) simulated dataset from the modgo result list.
modgo_new1=test[[1]][[1]]
#generate output var
# Fit vitD ~ all covariates on the real data, then generate a synthetic
# response from the fitted coefficients plus N(0,1) noise.
formula_str <- paste(colnames(modgo_dt), collapse = " + ")
formula_obj <- as.formula(paste("vitD ~", formula_str))
lm_model <- lm(formula_obj, data = vitd1)
summary(lm_model)
beta <- coef(lm_model)[-1] # Exclude intercept
# Simulate new data based on the regression model
set.seed(20240421) # Set seed for reproducibility
# NOTE(review): the hard-coded 1000 must match n_samples above — confirm if
# either is changed.
vitD <- as.matrix(modgo_new1) %*% as.matrix(beta) + rnorm(1000,sd = 1)
vitd1_new1=cbind.data.frame(vitD,modgo_new1)
# write.csv(vitd1_new1, file = "vitd_3_stat.csv")
# BUG FIX: the reload below was fused onto the tail of the write.csv comment
# ("#write.csv(...)vitd1=read.csv(...)"), so it was itself commented out and
# vitd1 was never re-read from disk. It now stands on its own line.
vitd1 <- read.csv("vitd_3.csv")
vitd1 <- na.omit(vitd1)
# read.csv() imports whole-number columns as integer; convert them to numeric
# so later arithmetic/correlation code sees a uniform double type.
integer_cols <- sapply(vitd1, is.integer)
vitd1[integer_cols] <- lapply(vitd1[integer_cols], as.numeric)
# Drop identifiers and variables excluded from the analysis.
vitd1 <- vitd1[, !colnames(vitd1) %in% c("SEQN", "wave", "RIAGENDR", "Cholesterol")]
# table(vitd1$Gender)
# table(vitd1$Race)
# table(vitd1$Education)
# vitd1$Gender=ifelse(vitd1$Gender=="Males",0,1)
# vitd1$Race=ifelse(vitd1$Race=="White",1,ifelse(vitd1$Race=="Black",2,ifelse(vitd1$Race=="Hispanic",3,4)))
# vitd1$Education=ifelse(vitd1$Education=="no_high",1,ifelse(vitd1$Education=="high_grad",2,ifelse(vitd1$Education=="college_entry",3,ifelse(vitd1$Education=="college_grad",4,999))))
# Load the statistically simulated dataset and map its numeric categorical
# codes back to the original string labels (inverse of the recode above),
# so it is comparable to the raw real/CTGAN data.
vitd1_new1=read.csv("vitd_3_stat.csv")
vitd1_new1$Gender=ifelse(vitd1_new1$Gender==0,"Males","Females")
vitd1_new1$Race=ifelse(vitd1_new1$Race==1,"White",ifelse(vitd1_new1$Race==2,"Black",ifelse(vitd1_new1$Race==3,"Hispanic","Other")))
vitd1_new1$Education=ifelse(vitd1_new1$Education==1,"no_high",ifelse(vitd1_new1$Education==2,"high_grad",ifelse(vitd1_new1$Education==3,"college_entry",ifelse(vitd1_new1$Education==4,"college_grad","Other"))))
# Load the CTGAN-simulated dataset; harmonize column types and drop the same
# identifier/unused columns as for the real data.
vitd1_ctgan1=read.csv("vitd_3_sim.csv")
integer_cols <- sapply(vitd1_ctgan1, is.integer)
vitd1_ctgan1[integer_cols] <- lapply(vitd1_ctgan1[integer_cols], as.numeric)
vitd1_ctgan1=vitd1_ctgan1[,!colnames(vitd1_ctgan1)%in%c("SEQN","wave","RIAGENDR","Cholesterol")]
# vitd1_ctgan1$Gender=ifelse(vitd1_ctgan1$Gender=="Males",0,1)
# vitd1_ctgan1$Race=ifelse(vitd1_ctgan1$Race=="White",1,ifelse(vitd1_ctgan1$Race=="Black",2,ifelse(vitd1_ctgan1$Race=="Hispanic",3,4)))
# vitd1_ctgan1$Education=ifelse(vitd1_ctgan1$Education=="no_high",1,ifelse(vitd1_ctgan1$Education=="high_grad",2,ifelse(vitd1_ctgan1$Education=="college_entry",3,ifelse(vitd1_ctgan1$Education=="college_grad",4,999))))
# visualisations
# distribution of covaraite
# Create a data frame with the three vectors
# Compare the marginal distribution of Calcium across the real data and the
# two synthetic datasets (statistical simulation vs CTGAN).
df <- data.frame(value = c(vitd1$Calcium, vitd1_new1$Calcium, vitd1_ctgan1$Calcium),
                 group = c(rep("Real", nrow(vitd1)), rep("Stat", nrow(vitd1_new1)),
                           rep("CTGAN", nrow(vitd1_ctgan1))))
df$group <- factor(df$group, levels = c("Real", "CTGAN", "Stat"))
# Filled density version (interactive inspection)
ggplot(df, aes(x = value, fill = group)) +
  geom_density(alpha = 0.3) +
  labs(title = "Distribution of Calcium", x = "Value", y = "Density") +
  scale_fill_manual(values = c("red", "blue", "green")) +
  theme_bw()
# BUG FIX: "g1=ggplot(...)" was fused onto the end of the previous plot
# expression ("theme_bw()g1=..."), a parse error; it is now a separate
# statement. Line-density version kept for the combined paper figure.
g1 <- ggplot(df, aes(x = value, color = group, linetype = group)) +
  geom_density(size = 1) +
  labs(title = "Distribution of Calcium", x = "Value", y = "Density") +
  scale_color_manual(values = c("red", "blue", "green")) +
  theme_bw()
# df <- data.frame(value = c(vitd1$Creatinine, vitd1_new1$Creatinine, vitd1_ctgan1$Creatinine),
# group = c(rep("Real", nrow(vitd1)),rep( "Stat", nrow(vitd1_new1)), rep("CTGAN", nrow(vitd1_ctgan1))))
# ggplot(df, aes(x = value, fill = group)) +
# geom_density(alpha = 0.3) +
# labs(title = "Distribution of Creatinine", x = "Value", y = "Density") +
# scale_fill_manual(values = c("blue", "red", "green"))+theme_bw()
#
# df <- data.frame(value = c(vitd1$Glucose_serum, vitd1_new1$Glucose_serum, vitd1_ctgan1$Glucose_serum),
# group = c(rep("Real", nrow(vitd1)),rep( "Stat", nrow(vitd1_new1)), rep("CTGAN", nrow(vitd1_ctgan1))))
# ggplot(df, aes(x = value, fill = group)) +
# geom_density(alpha = 0.3) +
# labs(title = "Distribution of Glucose_serum", x = "Value", y = "Density") +
# scale_fill_manual(values = c("blue", "red", "green"))+theme_bw()
#
# df <- data.frame(value = c(vitd1$Iron, vitd1_new1$Iron, vitd1_ctgan1$Iron),
# group = c(rep("Real", nrow(vitd1)),rep( "Stat", nrow(vitd1_new1)), rep("CTGAN", nrow(vitd1_ctgan1))))
# ggplot(df, aes(x = value, fill = group)) +
# geom_density(alpha = 0.3) +
# labs(title = "Distribution of Iron", x = "Value", y = "Density") +
# scale_fill_manual(values = c("blue", "red", "green"))+theme_bw()
# Marginal distribution of Sodium: real vs the two synthetic datasets.
df <- data.frame(value = c(vitd1$Sodium, vitd1_new1$Sodium, vitd1_ctgan1$Sodium),
                 group = c(rep("Real", nrow(vitd1)), rep("Stat", nrow(vitd1_new1)),
                           rep("CTGAN", nrow(vitd1_ctgan1))))
df$group <- factor(df$group, levels = c("Real", "CTGAN", "Stat"))
# Filled density version (interactive inspection)
ggplot(df, aes(x = value, fill = group)) +
  geom_density(alpha = 0.3) +
  labs(title = "Distribution of Sodium", x = "Value", y = "Density") +
  scale_fill_manual(values = c("red", "blue", "green")) +
  theme_bw()
# BUG FIX: "g2=ggplot(...)" was fused onto the previous expression
# ("theme_bw()g2=..."), a parse error; it is now a separate statement.
g2 <- ggplot(df, aes(x = value, color = group, linetype = group)) +
  geom_density(size = 1) +
  labs(title = "Distribution of Sodium", x = "Value", y = "Density") +
  scale_color_manual(values = c("red", "blue", "green")) +
  theme_bw()
# df <- data.frame(value = c(vitd1$protein, vitd1_new1$protein, vitd1_ctgan1$protein),
# group = c(rep("Real", nrow(vitd1)),rep( "Stat", nrow(vitd1_new1)), rep("CTGAN", nrow(vitd1_ctgan1))))
# ggplot(df, aes(x = value, fill = group)) +
# geom_density(alpha = 0.3) +
# labs(title = "Distribution of protein", x = "Value", y = "Density") +
# scale_fill_manual(values = c("blue", "red", "green"))+theme_bw()
# distribution of response: marginal distribution of vitD across datasets.
df <- data.frame(value = c(vitd1$vitD, vitd1_new1$vitD, vitd1_ctgan1$vitD),
                 group = c(rep("Real", nrow(vitd1)), rep("Stat", nrow(vitd1_new1)),
                           rep("CTGAN", nrow(vitd1_ctgan1))))
df$group <- factor(df$group, levels = c("Real", "CTGAN", "Stat"))
# Filled density version (interactive inspection)
ggplot(df, aes(x = value, fill = group)) +
  geom_density(alpha = 0.3) +
  labs(title = "Distribution of vitD", x = "Value", y = "Density") +
  scale_fill_manual(values = c("red", "blue", "green")) +
  theme_bw()
# BUG FIX: "g3=ggplot(...)" was fused onto the previous expression
# ("theme_bw()g3=..."), a parse error; it is now a separate statement.
g3 <- ggplot(df, aes(x = value, color = group, linetype = group)) +
  geom_density(size = 1) +
  labs(title = "Distribution of vitD", x = "Value", y = "Density") +
  scale_color_manual(values = c("red", "blue", "green")) +
  theme_bw()
library(gridExtra)
# Stack the three line-density panels into the combined figure for the paper.
ff <- grid.arrange(g1, g2, g3, nrow = 3, ncol = 1)
# ggsave(file = "scale.pdf", ff, height = 10, width = 5)
# model prediction and feature selection (train on synthtic and evaluation on real)(Brett K Beaulieu-Jones, Zhiwei Steven Wu, Chris Williams, Ran Lee, Sanjeev P Bhavnani,James Brian Byrd, and Casey S Greene. Privacy-preserving generative deep neural networks support clinical data sharing. Circulation: Cardiovascular Quality and Outcomes, 12(7):e005122, 2019.)
# Fit a linear model for vitD on (synthetic) training data and evaluate its
# out-of-sample error on (real) test data.
# NOTE(review): the predictor list is taken from the GLOBAL `vitd1_new1`
# (columns 3 onward), not from `train_data` -- both datasets must contain
# those columns; confirm if the column layout of vitd_3_stat.csv changes.
# Returns a list with the fitted coefficients and the test-set MSE.
train_and_test_lm <- function(train_data, test_data) {
  # Build "vitD ~ x1 + x2 + ..." from the shared covariate names
  predictor_names <- colnames(vitd1_new1)[c(3:length(colnames(vitd1_new1)))]
  rhs <- paste(predictor_names, collapse = " + ")
  model_formula <- as.formula(paste("vitD ~", rhs))
  fitted_model <- lm(model_formula, data = train_data)
  # Predict on the held-out evaluation set and score with mean squared error
  predictions <- predict(fitted_model, newdata = test_data)
  squared_errors <- (test_data$vitD - predictions)^2
  list(coefficients = coef(fitted_model), mse = mean(squared_errors, na.rm = TRUE))
}
# Train on each synthetic dataset, evaluate on the real data (train-on-
# synthetic / test-on-real scheme), and tabulate coefficients + MSE.
stat_pred=train_and_test_lm(train_data=vitd1_new1,test_data=vitd1)
ctgan_pred=train_and_test_lm(train_data=vitd1_ctgan1,test_data=vitd1)
featuredt=rbind.data.frame(stat_pred$coefficients,ctgan_pred$coefficients)
colnames(featuredt)=names(stat_pred$coefficients)
#featuredt
featuredt$mse=c(stat_pred[[2]],ctgan_pred[[2]])
rownames(featuredt)=c("stat","ctgan")
featuredt## (Intercept) RIDAGEYR Calcium Creatinine Glucose_serum Iron
## stat 3.484649 0.1062590 11.828664 3.201254 -0.07338859 0.07090323
## ctgan 3.491721 0.1946531 8.149863 2.373267 -0.01473635 0.08444229
## Sodium protein RaceHispanic RaceOther RaceWhite
## stat -0.6793970 -9.2284789 2.368408 5.014823 -2.199954
## ctgan -0.1529279 -0.1609921 -13.376768 -14.777236 -22.785321
## Educationcollege_grad Educationhigh_grad Educationno_high EducationOther
## stat 0.03695613 -0.04463778 0.1714031 3.758243
## ctgan -4.38322149 -3.95002544 -6.0249384 -6.826060
## Marriage Total_household Total_income GenderMales NA mse
## stat -0.6214702 -1.52424241 0.007009987 -2.5960833 3.4846491 12414.770
## ctgan 0.2650512 -0.03717054 -0.372737486 0.0557212 0.1526231 520.317
# sanity checks (https://github.com/vanderschaarlab/synthcity)(not in the paper, decided not doing here)
# also, the privacy related metrics are not used here
# similarity scores
# Compute four distribution-similarity scores between two numeric vectors:
#   1. avg_inv_KLD  - inverse Kullback-Leibler divergence of vec1 vs vec2
#   2. ks           - Kolmogorov-Smirnov test statistic
#   3. JS           - Jensen-Shannon distance (sqrt of JS divergence)
#   4. Wasserstein  - mean transported mass from transport::transport(), or NA
# Returns c(avg_inv_KLD, ks, JS, Wasserstein).
# NOTE(review): KLD/JSD here are applied to the raw data vectors, not to
# normalized density estimates, and both vectors must have equal length for
# the elementwise ratio p/q; negative or zero values produce NaN (visible in
# the pasted output below) -- confirm this matches the paper's intent.
similarity_metric=function(vec1,vec2){
#KLD:not symmetric, the ref distribution is the second distribution
# Function to compute Kullback-Leibler Divergence (KLD)
KL_divergence <- function(p, q) {
return(sum(p * log(p / q)))
}
# Compute KLD for the given distributions
KLD <- KL_divergence(vec1, vec2)
# Compute the average inverse of KLD
# NOTE(review): KLD is a scalar, so mean(KLD) == KLD; the mean() is a no-op.
avg_inv_KLD <- 1 / mean(KLD)
# ks test
ks_result <- ks.test(vec1, vec2)
ks_result$statistic
ks_result$p.value
# chi-squared test: not applicable
# Jensen-Shannon distance (metric)
# install.packages("philentropy")
# library(divo)
JSD <- function(P, Q) {
M <- 0.5 * (P + Q)
#js_divergence <- 0.5 * (KL_divergence(P, M) + KL_divergence(Q, M))
kl_divergence <- function(P, Q) sum(P * log2(P / Q))
js_divergence <- 0.5 * (kl_divergence(P, M) + kl_divergence(Q, M))
return(sqrt(js_divergence))
}
js_distance <- JSD(vec1, vec2)
#tested the same results
# library(philentropy)
# x=rbind(vec1,vec2)
# philentropy::JSD(x)
# Wasserstein Distance
# install.packages("transport")
# library(transport)
# Pairwise squared-difference cost matrix for the optimal-transport problem.
# possibly() returns NA instead of erroring if transport() rejects the input.
cost_matrix <- outer(vec1, vec2, "-")^2
wasserstein_distance = purrr::possibly(function(vec1,vec2){return(transport(vec1, vec2, costm = cost_matrix))},
otherwise = NA)#this gives all for each individual pair
wasserstein_distance2 =wasserstein_distance(vec1,vec2)
#calculate an average
wasserstein_distance_avg=purrr::possibly(function(wasserstein_distance){mean(wasserstein_distance$mass)}, otherwise = NA)
wasserstein_distance_avg2=wasserstein_distance_avg(wasserstein_distance2)
result_vec=c(avg_inv_KLD,as.numeric(ks_result$statistic),js_distance,wasserstein_distance_avg2)
return(result_vec)}
# Similarity of the Calcium marginal: each synthetic dataset vs the real data
# (vec2 = real data is the KLD reference distribution).
similarity_Calcium=rbind.data.frame(similarity_metric(vec1=vitd1_new1$Calcium,vec2=vitd1$Calcium),similarity_metric(vec1=vitd1_ctgan1$Calcium,vec2=vitd1$Calcium))
colnames(similarity_Calcium)=c("avg_inv_KLD","ks","JS","Wasserstein")
rownames(similarity_Calcium)=c("stat","ctgan")
print("similarity_Calcium")## [1] "similarity_Calcium"
## avg_inv_KLD ks JS Wasserstein
## stat 0.004244126 0.01403629 7.576467 8.933357e-05
## ctgan 0.002180598 0.11367533 7.427806 8.933357e-05
# similarity_Creatinine=rbind.data.frame(similarity_metric(vec1=vitd1_new1$Creatinine,vec2=vitd1$Creatinine),similarity_metric(vec1=vitd1_ctgan1$Creatinine,vec2=vitd1$Creatinine))
# colnames(similarity_Creatinine)=c("avg_inv_KLD","ks","JS","Wasserstein")
# rownames(similarity_Creatinine)=c("stat","ctgan")
# print("similarity_Creatinine")
# similarity_Creatinine
# similarity_Glucose_serum=rbind.data.frame(similarity_metric(vec1=vitd1_new1$Glucose_serum,vec2=vitd1$Glucose_serum),similarity_metric(vec1=vitd1_ctgan1$Glucose_serum,vec2=vitd1$Glucose_serum))
# colnames(similarity_Glucose_serum)=c("avg_inv_KLD","ks","JS","Wasserstein")
# rownames(similarity_Glucose_serum)=c("stat","ctgan")
# print("similarity_Glucose_serum")
# similarity_Glucose_serum
#
# similarity_Iron=rbind.data.frame(similarity_metric(vec1=vitd1_new1$Iron,vec2=vitd1$Iron),similarity_metric(vec1=vitd1_ctgan1$Iron,vec2=vitd1$Iron))
# colnames(similarity_Iron)=c("avg_inv_KLD","ks","JS","Wasserstein")
# rownames(similarity_Iron)=c("stat","ctgan")
# print("similarity_Iron")
# similarity_Iron
#
# similarity_Sodium=rbind.data.frame(similarity_metric(vec1=vitd1_new1$Sodium,vec2=vitd1$Sodium),similarity_metric(vec1=vitd1_ctgan1$Sodium,vec2=vitd1$Sodium))
# colnames(similarity_Sodium)=c("avg_inv_KLD","ks","JS","Wasserstein")
# rownames(similarity_Sodium)=c("stat","ctgan")
# print("similarity_Sodium")
# similarity_Sodium
# similarity_protein=rbind.data.frame(similarity_metric(vec1=vitd1_new1$protein,vec2=vitd1$protein),similarity_metric(vec1=vitd1_ctgan1$protein,vec2=vitd1$protein))
# colnames(similarity_protein)=c("avg_inv_KLD","ks","JS","Wasserstein")
# rownames(similarity_protein)=c("stat","ctgan")
# print("similarity_protein")
# similarity_protein
# Similarity of the vitD (response) marginal: synthetic vs real.
# NOTE(review): NaN entries in the pasted output below come from negative
# values in the KLD/JSD log-ratio -- see the note on similarity_metric().
similarity_vitD=rbind.data.frame(similarity_metric(vec1=vitd1_new1$vitD,vec2=vitd1$vitD),similarity_metric(vec1=vitd1_ctgan1$vitD,vec2=vitd1$vitD))
colnames(similarity_vitD)=c("avg_inv_KLD","ks","JS","Wasserstein")
rownames(similarity_vitD)=c("stat","ctgan")
print("similarity_vitD")## [1] "similarity_vitD"
## avg_inv_KLD ks JS Wasserstein
## stat NaN 1.00000000 NaN 8.933357e-05
## ctgan 1.26875e-05 0.06078568 191.1033 8.933357e-05
# categorical variables plot
# Compute, for every categorical (character/factor) column of `df`, the
# proportion of each category level, returned as a long data frame with
# columns Category / Var / Proportion / Data (the latter tagging the dataset
# with `data_name` for later faceted plotting).
get_category_proportions <- function(df, data_name) {
  # Treat character columns as categorical by converting them to factors.
  categorical_cols <- sapply(df, is.character)
  df[categorical_cols] <- lapply(df[categorical_cols], factor)
  # Names of all categorical variables (original factors + converted ones)
  cat_vars <- names(df)[sapply(df, is.factor)]
  # Build one small data frame per variable, then bind once -- avoids the
  # original anti-pattern of growing a data frame inside the loop.
  per_var <- lapply(cat_vars, function(var) {
    prop <- table(df[[var]]) / length(df[[var]])
    data.frame(Category = names(prop), Proportion = prop, Data_Frame = var)
  })
  # Seed frame keeps the output shape identical to the original even when
  # there are no categorical variables.
  seed_df <- data.frame(Category = character(), Proportion = numeric(),
                        Data_Frame = character())
  prop_df <- bind_rows(c(list(seed_df), per_var))
  # `Proportion` is a table, so data.frame() expands it into
  # Proportion.Var1 / Proportion.Freq; keep only the frequency column.
  prop_df <- prop_df[, c("Category", "Data_Frame", "Proportion.Freq")]
  colnames(prop_df) <- c("Category", "Var", "Proportion")
  prop_df$Data <- data_name
  return(prop_df)
}
# Category proportions of the real data (pasted console output retained below
# for reference against the synthetic datasets).
get_category_proportions(vitd1,"real")## Category Var Proportion Data
## 1 Black Race 0.489455615 real
## 2 Hispanic Race 0.287101520 real
## 3 Other Race 0.047474252 real
## 4 White Race 0.175968612 real
## 5 college_entry Education 0.267974497 real
## 6 college_grad Education 0.197940167 real
## 7 high_grad Education 0.235703776 real
## 8 no_high Education 0.296714076 real
## 9 Other Education 0.001667484 real
## 10 Females Gender 0.513683178 real
## 11 Males Gender 0.486316822 real
## Category Var Proportion Data
## 1 Black Race 0.476 stat
## 2 Hispanic Race 0.299 stat
## 3 Other Race 0.048 stat
## 4 White Race 0.177 stat
## 5 college_entry Education 0.251 stat
## 6 college_grad Education 0.210 stat
## 7 high_grad Education 0.242 stat
## 8 no_high Education 0.296 stat
## 9 Other Education 0.001 stat
## 10 Females Gender 0.492 stat
## 11 Males Gender 0.508 stat
## Category Var Proportion Data
## 1 Black Race 0.477 ctgan
## 2 Hispanic Race 0.275 ctgan
## 3 Other Race 0.065 ctgan
## 4 White Race 0.183 ctgan
## 5 Education 0.178 ctgan
## 6 college_entry Education 0.204 ctgan
## 7 college_grad Education 0.177 ctgan
## 8 high_grad Education 0.135 ctgan
## 9 no_high Education 0.296 ctgan
## 10 Other Education 0.010 ctgan
## 11 Females Gender 0.504 ctgan
## 12 Males Gender 0.496 ctgan
# Plot
# Proportions per dataset, combined into one long frame for a dodged,
# per-variable faceted bar chart comparing real vs stat vs ctgan.
prop_df1 <- get_category_proportions(vitd1,"real")
prop_df2 <- get_category_proportions(vitd1_new1,"stat")
prop_df3 <- get_category_proportions(vitd1_ctgan1,"ctgan")
# Combine proportions from all data frames
combined_prop_df <- bind_rows(prop_df1, prop_df2, prop_df3)
# Plot
ggplot(combined_prop_df, aes(x = Category, y = Proportion, fill = Data)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Comparison of Category Proportions",
x = "Category",
y = "Proportion",
fill = "Data Frame") +
theme_minimal()+
facet_wrap(~ Var, scales = "free")#corelation plot
# Correlation-structure comparison: real vs stat vs CTGAN. cor() needs
# numeric input, so map the categorical labels back to numeric codes first.
variables <- c("RIDAGEYR", "Calcium", "Creatinine", "Glucose_serum", "Iron",
               "Sodium", "protein", "Race", "Education", "Marriage",
               "Total_household", "Total_income", "Gender")
# One shared recode helper instead of three copy-pasted ifelse() chains:
# Males -> 0, Females -> 1; Race/Education to ordinal codes (999 = unknown).
recode_categoricals <- function(d) {
  d$Gender <- ifelse(d$Gender == "Males", 0, 1)
  d$Race <- ifelse(d$Race == "White", 1,
                   ifelse(d$Race == "Black", 2,
                          ifelse(d$Race == "Hispanic", 3, 4)))
  d$Education <- ifelse(d$Education == "no_high", 1,
                        ifelse(d$Education == "high_grad", 2,
                               ifelse(d$Education == "college_entry", 3,
                                      ifelse(d$Education == "college_grad", 4, 999))))
  d
}
vitd1 <- recode_categoricals(vitd1)
vitd1_new1 <- recode_categoricals(vitd1_new1)
vitd1_ctgan1 <- recode_categoricals(vitd1_ctgan1)
cor1 <- cor(as.matrix(vitd1[, variables]), use = "pairwise.complete.obs")
cor2 <- cor(as.matrix(vitd1_new1[, variables]), use = "pairwise.complete.obs")
cor3 <- cor(as.matrix(vitd1_ctgan1[, variables]), use = "pairwise.complete.obs")
# Three correlation heatmaps side by side
par(mfrow = c(1, 3))
p1 <- corrplot::corrplot(cor1, tl.col = "black")
p2 <- corrplot::corrplot(cor2, tl.col = "black")
p3 <- corrplot::corrplot(cor3, tl.col = "black")
# BUG FIX: "vitd1=read.csv(...)" was fused onto the corrplot call above
# ('..."black")vitd1=read.csv...'), a parse error; the reload that starts the
# scaled analysis section now stands on its own lines.
vitd1 <- read.csv("vitd_3.csv")
vitd1 <- vitd1[, !colnames(vitd1) %in% c("SEQN", "wave", "RIAGENDR", "Cholesterol")]
vitd1 <- na.omit(vitd1)
table(vitd1$Gender)
##
## Females Males
## 5237 4958
vitd1 <- recode_categoricals(vitd1)
# zip_path <- "10-1055-a-2048-7692-s22010082/modgo.tar.gz"
# # Install the package from the zip file
# install.packages(zip_path, repos = NULL, type = "source")
# Standardize the first eight covariate columns (z-scores) before simulation.
covariates=vitd1[,1:8]
covariates<- as.data.frame(apply(covariates, 2, function(x) scale(x)))
vitd1[,1:8]<- covariates
#write_csv(vitd1,"vitd_3_scaled_all.csv")
# Drop the response: modgo simulates the covariates only; vitD is generated
# afterwards from a fitted linear model.
modgo_dt=vitd1[,!colnames(vitd1)=="vitD"]
# library("modgo")
# Positional picks: column 13 is the binary variable, columns 8-12 the
# categorical ones. NOTE(review): positional indexing — confirm the column
# order of vitd_3.csv has not changed.
binary_variables <- colnames(modgo_dt)[13]
categorical_variables <- colnames(modgo_dt)[8:12]
# Draw 1000 synthetic rows with modgo (single repetition, fixed seed).
test <- modgo(data = modgo_dt,bin_variables = binary_variables,categ_variables = categorical_variables,nrep = 1,ties_method= "max", variables= colnames(modgo_dt),
n_samples= 1000,sigma= c(),
noise_mu= FALSE, pertr_vec= c(),
change_cov= c(),change_amount= 0,seed= 1,
thresh_var= c(), thresh_force = FALSE,
var_prop= c(),var_infl=c(),infl_cov_stable=FALSE)## [1] "Marriage will be treated as continuous due to having more than 7 unique values."
## [1] "Total_income will be treated as continuous due to having more than 7 unique values."
# First repetition's simulated covariate matrix
modgo_new1=test[[1]][[1]]
#generate output var
# Fit the "ground-truth" linear model on the real data; its coefficients
# define the response-generating mechanism for the synthetic covariates.
formula_str <- paste(colnames(modgo_dt), collapse = " + ")
formula_obj <- as.formula(paste("vitD ~", formula_str))
lm_model <- lm(formula_obj, data = vitd1)
summary(lm_model)##
## Call:
## lm(formula = formula_obj, data = vitd1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9729 -0.6582 -0.0651 0.5709 7.1381
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0030416 0.0364312 -0.083 0.933
## RIDAGEYR 0.0743912 0.0112834 6.593 4.53e-11 ***
## Calcium 0.1715653 0.0103601 16.560 < 2e-16 ***
## Creatinine 0.0533336 0.0101465 5.256 1.50e-07 ***
## Glucose_serum -0.1113667 0.0099587 -11.183 < 2e-16 ***
## Iron 0.0979097 0.0098383 9.952 < 2e-16 ***
## Sodium -0.0629885 0.0099976 -6.300 3.09e-10 ***
## protein -0.1733742 0.0102651 -16.890 < 2e-16 ***
## Race 0.0900385 0.0126336 7.127 1.10e-12 ***
## Education 0.0001639 0.0002332 0.703 0.482
## Marriage -0.0254626 0.0039548 -6.438 1.26e-10 ***
## Total_household -0.0581861 0.0063103 -9.221 < 2e-16 ***
## Total_income 0.0001438 0.0006130 0.235 0.814
## Gender 0.0994556 0.0202443 4.913 9.12e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9514 on 10181 degrees of freedom
## Multiple R-squared: 0.09606, Adjusted R-squared: 0.09491
## F-statistic: 83.23 on 13 and 10181 DF, p-value: < 2.2e-16
beta <- coef(lm_model)[-1] # Exclude intercept
# Simulate the response for the synthetic covariates from the fitted linear
# model, with standard-normal noise (intercept ~0 in the fit, so omitted).
set.seed(20240421) # Set seed for reproducibility
# nrow(modgo_new1) == n_samples (1000); using nrow keeps the noise vector in
# sync if the sample size is ever changed upstream.
vitD <- as.matrix(modgo_new1) %*% as.matrix(beta) + rnorm(nrow(modgo_new1),sd = 1)
vitd1_new1=cbind.data.frame(vitD,modgo_new1)
#write.csv(vitd1_new1,file="vitd_3_stat_scaled_all.csv")
# FIX: the read below was fused onto the commented write.csv line by the
# chunk extraction, which disabled it; it must run so the comparison section
# uses the scaled real data.
vitd1=read.csv("vitd_3_scaled_all.csv")
vitd1=na.omit(vitd1)
# Coerce integer columns to double so all covariates are handled uniformly.
integer_cols <- sapply(vitd1, is.integer)
vitd1[integer_cols] <- lapply(vitd1[integer_cols], as.numeric)
vitd1=vitd1[,!colnames(vitd1)%in%c("SEQN","wave","RIAGENDR","Cholesterol")]
# table(vitd1$Gender)
# table(vitd1$Race)
# table(vitd1$Education)
# vitd1$Gender=ifelse(vitd1$Gender=="Males",0,1)
# vitd1$Race=ifelse(vitd1$Race=="White",1,ifelse(vitd1$Race=="Black",2,ifelse(vitd1$Race=="Hispanic",3,4)))
# vitd1$Education=ifelse(vitd1$Education=="no_high",1,ifelse(vitd1$Education=="high_grad",2,ifelse(vitd1$Education=="college_entry",3,ifelse(vitd1$Education=="college_grad",4,999))))
# Statistical (modgo) simulation written by the previous section.
vitd1_new1=read.csv("vitd_3_stat_scaled_all.csv")
# vitd1_new1$Gender=ifelse(vitd1_new1$Gender==0,"Males","Females")
# vitd1_new1$Race=ifelse(vitd1_new1$Race==1,"White",ifelse(vitd1_new1$Race==2,"Black",ifelse(vitd1_new1$Race==3,"Hispanic","Other")))
# vitd1_new1$Education=ifelse(vitd1_new1$Education==1,"no_high",ifelse(vitd1_new1$Education==2,"high_grad",ifelse(vitd1_new1$Education==3,"college_entry",ifelse(vitd1_new1$Education==4,"college_grad","Other"))))
# CTGAN simulation (generated externally).
vitd1_ctgan1=read.csv("vitd_3_sim_scaled_all.csv")
integer_cols <- sapply(vitd1_ctgan1, is.integer)
vitd1_ctgan1[integer_cols] <- lapply(vitd1_ctgan1[integer_cols], as.numeric)
vitd1_ctgan1=vitd1_ctgan1[,!colnames(vitd1_ctgan1)%in%c("SEQN","wave","RIAGENDR","Cholesterol")]
# vitd1_ctgan1$Gender=ifelse(vitd1_ctgan1$Gender=="Males",0,1)
# vitd1_ctgan1$Race=ifelse(vitd1_ctgan1$Race=="White",1,ifelse(vitd1_ctgan1$Race=="Black",2,ifelse(vitd1_ctgan1$Race=="Hispanic",3,4)))
# vitd1_ctgan1$Education=ifelse(vitd1_ctgan1$Education=="no_high",1,ifelse(vitd1_ctgan1$Education=="high_grad",2,ifelse(vitd1_ctgan1$Education=="college_entry",3,ifelse(vitd1_ctgan1$Education=="college_grad",4,999))))
# visualisations
# distribution of covaraite
# Create a data frame with the three vectors (long format for ggplot)
df <- data.frame(value = c(vitd1$Calcium, vitd1_new1$Calcium, vitd1_ctgan1$Calcium),
group = c(rep("Real", nrow(vitd1)),rep( "Stat", nrow(vitd1_new1)), rep("CTGAN", nrow(vitd1_ctgan1))))
# Fix the legend order across all figures.
df$group=factor(df$group, levels=c("Real","CTGAN","Stat"))
# Plot the distribution curves (filled-density version, shown interactively)
ggplot(df, aes(x = value, fill = group)) +
geom_density(alpha = 0.3) +
labs(title = "Distribution of Calcium", x = "Value", y = "Density") +
scale_fill_manual(values = c("red", "blue", "green"))+theme_bw()
# FIX: the assignment below was fused onto the previous line by the chunk
# extraction (syntax error); g1 is the line/colour variant kept for the
# combined figure.
g1=ggplot(df, aes(x = value, color = group, linetype = group)) +
geom_density(size = 1) +
labs(title = "Distribution of Calcium", x = "Value", y = "Density") +
scale_color_manual(values = c("red", "blue", "green")) +
theme_bw()
# df <- data.frame(value = c(vitd1$Creatinine, vitd1_new1$Creatinine, vitd1_ctgan1$Creatinine),
# group = c(rep("Real", nrow(vitd1)),rep( "Stat", nrow(vitd1_new1)), rep("CTGAN", nrow(vitd1_ctgan1))))
# ggplot(df, aes(x = value, fill = group)) +
# geom_density(alpha = 0.3) +
# labs(title = "Distribution of Creatinine", x = "Value", y = "Density") +
# scale_fill_manual(values = c("blue", "red", "green"))+theme_bw()
#
# df <- data.frame(value = c(vitd1$Glucose_serum, vitd1_new1$Glucose_serum, vitd1_ctgan1$Glucose_serum),
# group = c(rep("Real", nrow(vitd1)),rep( "Stat", nrow(vitd1_new1)), rep("CTGAN", nrow(vitd1_ctgan1))))
# ggplot(df, aes(x = value, fill = group)) +
# geom_density(alpha = 0.3) +
# labs(title = "Distribution of Glucose_serum", x = "Value", y = "Density") +
# scale_fill_manual(values = c("blue", "red", "green"))+theme_bw()
#
# df <- data.frame(value = c(vitd1$Iron, vitd1_new1$Iron, vitd1_ctgan1$Iron),
# group = c(rep("Real", nrow(vitd1)),rep( "Stat", nrow(vitd1_new1)), rep("CTGAN", nrow(vitd1_ctgan1))))
# ggplot(df, aes(x = value, fill = group)) +
# geom_density(alpha = 0.3) +
# labs(title = "Distribution of Iron", x = "Value", y = "Density") +
# scale_fill_manual(values = c("blue", "red", "green"))+theme_bw()
# Sodium: long-format stack of the three data sources.
df <- data.frame(value = c(vitd1$Sodium, vitd1_new1$Sodium, vitd1_ctgan1$Sodium),
group = c(rep("Real", nrow(vitd1)),rep( "Stat", nrow(vitd1_new1)), rep("CTGAN", nrow(vitd1_ctgan1))))
df$group=factor(df$group, levels=c("Real","CTGAN","Stat"))
ggplot(df, aes(x = value, fill = group)) +
geom_density(alpha = 0.3) +
labs(title = "Distribution of Sodium", x = "Value", y = "Density") +
scale_fill_manual(values = c("red", "blue", "green"))+theme_bw()
# FIX: the assignment below was fused onto the previous line by the chunk
# extraction (syntax error); g2 is the line/colour variant for the combined
# figure.
g2=ggplot(df, aes(x = value, color = group, linetype = group)) +
geom_density(size = 1) +
labs(title = "Distribution of Sodium", x = "Value", y = "Density") +
scale_color_manual(values = c("red", "blue", "green")) +
theme_bw()
# df <- data.frame(value = c(vitd1$protein, vitd1_new1$protein, vitd1_ctgan1$protein),
# group = c(rep("Real", nrow(vitd1)),rep( "Stat", nrow(vitd1_new1)), rep("CTGAN", nrow(vitd1_ctgan1))))
# ggplot(df, aes(x = value, fill = group)) +
# geom_density(alpha = 0.3) +
# labs(title = "Distribution of protein", x = "Value", y = "Density") +
# scale_fill_manual(values = c("blue", "red", "green"))+theme_bw()
# distribution of response
df <- data.frame(value = c(vitd1$vitD, vitd1_new1$vitD, vitd1_ctgan1$vitD),
group = c(rep("Real", nrow(vitd1)),rep( "Stat", nrow(vitd1_new1)), rep("CTGAN", nrow(vitd1_ctgan1))))
df$group=factor(df$group, levels=c("Real","CTGAN","Stat"))
ggplot(df, aes(x = value, fill = group)) +
geom_density(alpha = 0.3) +
labs(title = "Distribution of vitD", x = "Value", y = "Density") +
scale_fill_manual(values = c("red", "blue", "green"))+theme_bw()
# FIX: the assignment below was fused onto the previous line by the chunk
# extraction (syntax error).
g3=ggplot(df, aes(x = value, color = group, linetype = group)) +
geom_density(size = 1) +
labs(title = "Distribution of vitD", x = "Value", y = "Density") +
scale_color_manual(values = c("red", "blue", "green")) +
theme_bw()
library(gridExtra)
# Combined three-panel figure (Calcium, Sodium, vitD densities)
ff=grid.arrange(g1,g2,g3,nrow = 3,ncol=1)
#ggsave(file="scale2.pdf",ff,height =10 ,width = 5)
# model prediction and feature selection (train on synthtic and evaluation on real)(Brett K Beaulieu-Jones, Zhiwei Steven Wu, Chris Williams, Ran Lee, Sanjeev P Bhavnani,James Brian Byrd, and Casey S Greene. Privacy-preserving generative deep neural networks support clinical data sharing. Circulation: Cardiovascular Quality and Outcomes, 12(7):e005122, 2019.)
# Train a linear model on (synthetic) training data and evaluate it on the
# (real) test data — "train on synthetic, test on real" as in
# Beaulieu-Jones et al. (2019).
#
# Args:
#   train_data: data frame holding the response `vitD` plus covariates.
#   test_data:  data frame with the same covariate columns; used only for
#               evaluation.
# Returns: list(coefficients = fitted coefficients, mse = test-set MSE).
train_and_test_lm <- function(train_data, test_data) {
  # FIX: predictors were previously taken from the global `vitd1_new1`
  # (columns 3:end, i.e. skipping the row-name column "X" and `vitD`),
  # making the function depend on outside state. Derive them from
  # train_data itself instead; excluding "X" preserves the old behavior
  # for data frames read back with a row-name column.
  predictors <- setdiff(colnames(train_data), c("X", "vitD"))
  formula_obj <- as.formula(paste("vitD ~", paste(predictors, collapse = " + ")))
  lm_model <- lm(formula_obj, data = train_data)
  # Make predictions on the test data using the trained model
  test_predictions <- predict(lm_model, newdata = test_data)
  # Mean squared error on the test data (NA-safe)
  mse <- mean((test_data$vitD - test_predictions)^2, na.rm = TRUE)
  # Return the coefficients and MSE
  list(coefficients = coef(lm_model), mse = mse)
}
# Train on each synthetic source, evaluate on the real data.
stat_pred=train_and_test_lm(train_data=vitd1_new1,test_data=vitd1)
ctgan_pred=train_and_test_lm(train_data=vitd1_ctgan1,test_data=vitd1)
# One row of fitted coefficients per synthetic source, plus the real-data
# test MSE as the final column.
featuredt=rbind.data.frame(stat_pred$coefficients,ctgan_pred$coefficients)
colnames(featuredt)=names(stat_pred$coefficients)
#featuredt
featuredt$mse=c(stat_pred[[2]],ctgan_pred[[2]])
rownames(featuredt)=c("stat","ctgan")
featuredt## (Intercept) RIDAGEYR Calcium Creatinine Glucose_serum Iron
## stat -0.11921376 0.07770115 0.20036293 0.05944629 -0.10917695 0.08782520
## ctgan -0.09020911 0.07227575 -0.01573203 0.17321290 -0.03486512 0.01741634
## Sodium protein Race Education Marriage
## stat -0.02571598 -0.1384860 0.143633212 -0.0004005039 0.005775506
## ctgan 0.02584416 -0.1814424 0.005331894 -0.0003489841 -0.008790168
## Total_household Total_income Gender mse
## stat -0.092789712 0.002611729 0.15032585 0.9202062
## ctgan -0.004831162 0.002092903 0.07304619 0.9864258
# similarity scores
# Compute four distribution-similarity measures between two numeric vectors.
# vec1 is the synthetic sample, vec2 the real (reference) sample.
# Returns c(avg_inv_KLD, KS statistic, Jensen-Shannon distance,
#           mean transported mass from the Wasserstein plan) — NA for the
# last entry when the transport computation fails.
# NOTE(review): the KL/JS formulas are applied to the raw data values, not
# to estimated densities; they assume positive ratios p/q — confirm this is
# the intended definition for the paper.
similarity_metric=function(vec1,vec2){
#KLD:not symmetric, the ref distribution is the second distribution
# Function to compute Kullback-Leibler Divergence (KLD)
KL_divergence <- function(p, q) {
return(sum(p * log(p / q),na.rm = TRUE))
}
# Compute KLD for the given distributions
KLD <- KL_divergence(vec1, vec2)
# Compute the average inverse of KLD (KLD is scalar, so mean() is identity)
avg_inv_KLD <- 1 / mean(KLD)
# Kolmogorov-Smirnov two-sample test
ks_result <- ks.test(vec1, vec2)
ks_result$statistic
ks_result$p.value
# (the two lines above have no effect; the statistic is read again below)
# chi-squared test: not applicable
# Jensen-Shannon distance (metric)
# install.packages("philentropy")
# library(divo)
JSD <- function(P, Q) {
M <- 0.5 * (P + Q)
#js_divergence <- 0.5 * (KL_divergence(P, M) + KL_divergence(Q, M))
kl_divergence <- function(P, Q) sum(P * log2(P / Q),na.rm = TRUE)
js_divergence <- 0.5 * (kl_divergence(P, M) + kl_divergence(Q, M))
return(sqrt(js_divergence))
}
js_distance <- JSD(vec1, vec2)
#tested the same results
# library(philentropy)
# x=rbind(vec1,vec2)
# philentropy::JSD(x)
# Wasserstein Distance
# install.packages("transport")
# library(transport)
# Squared-difference cost between every pair of observations; possibly()
# returns NA instead of erroring (e.g. if transport rejects the inputs).
cost_matrix <- outer(vec1, vec2, "-")^2
wasserstein_distance = purrr::possibly(function(vec1,vec2){return(transport(vec1, vec2, costm = cost_matrix))},
otherwise = NA)#this gives all for each individual pair
wasserstein_distance2 =wasserstein_distance(vec1,vec2)
#calculate an average
wasserstein_distance_avg=purrr::possibly(function(wasserstein_distance){mean(wasserstein_distance$mass)}, otherwise = NA)
wasserstein_distance_avg2=wasserstein_distance_avg(wasserstein_distance2)
result_vec=c(avg_inv_KLD,as.numeric(ks_result$statistic),js_distance,wasserstein_distance_avg2)
return(result_vec)}
# Per-variable similarity between each synthetic data set and the real data.
# Rows: "stat" (modgo) and "ctgan"; columns: the four similarity metrics.
# One helper replaces eight copy-pasted blocks; the per-variable global
# tables (similarity_Calcium, ...) are still created for the export below.
build_similarity <- function(var) {
  out <- rbind.data.frame(
    similarity_metric(vec1 = vitd1_new1[[var]], vec2 = vitd1[[var]]),
    similarity_metric(vec1 = vitd1_ctgan1[[var]], vec2 = vitd1[[var]]))
  colnames(out) <- c("avg_inv_KLD", "ks", "JS", "Wasserstein")
  rownames(out) <- c("stat", "ctgan")
  out
}
similarity_Calcium <- build_similarity("Calcium")
print("similarity_Calcium")
similarity_vitD <- build_similarity("vitD")
print("similarity_vitD")
similarity_RIDAGEYR <- build_similarity("RIDAGEYR")
print("similarity_RIDAGEYR")
similarity_Creatinine <- build_similarity("Creatinine")
print("similarity_Creatinine")
similarity_Glucose_serum <- build_similarity("Glucose_serum")
print("similarity_Glucose_serum")
similarity_Iron <- build_similarity("Iron")
print("similarity_Iron")
similarity_Sodium <- build_similarity("Sodium")
print("similarity_Sodium")
similarity_protein <- build_similarity("protein")
print("similarity_protein")
library(writexl)
# Sheet name -> table mapping for the optional Excel export
list_of_dfs <- list(similarity_Calcium = similarity_Calcium, similarity_protein = similarity_protein, similarity_Sodium = similarity_Sodium,similarity_Iron=similarity_Iron,similarity_Glucose_serum=similarity_Glucose_serum,similarity_Creatinine=similarity_Creatinine,similarity_RIDAGEYR=similarity_RIDAGEYR,similarity_vitD=similarity_vitD)
#write_xlsx(list_of_dfs, "vitd_similarity.xlsx")
# Compute per-category proportions of the categorical variables of a data
# set, in long format for plotting.
#
# Args:
#   df: data frame containing the categorical columns.
#   data_name: label ("real"/"stat"/"ctgan") stored in the Data column.
#   categorical_cols: columns to treat as categorical. Generalized from the
#     previously hard-coded set; the default preserves the old behavior.
# Returns: data frame with columns Category, Var, Proportion, Data.
get_category_proportions <- function(df, data_name,
                                     categorical_cols = c("Race", "Education", "Marriage",
                                                          "Total_household", "Total_income", "Gender")) {
  df[categorical_cols] <- lapply(df[categorical_cols], factor)
  # Names of the factor columns (the ones just converted)
  cat_vars <- names(df)[vapply(df, is.factor, logical(1))]
  # FIX: build one small table per variable and bind once, instead of
  # growing a data frame with bind_rows() inside a loop.
  per_var <- lapply(cat_vars, function(var) {
    prop <- table(df[[var]]) / length(df[[var]])
    data.frame(Category = names(prop), Var = var,
               Proportion = as.numeric(prop), Data = data_name)
  })
  prop_df <- do.call(rbind, per_var)
  rownames(prop_df) <- NULL
  prop_df
}
# Inspect the real data's category proportions (echoed output follows).
get_category_proportions(vitd1,"real")## Category Var Proportion Data
## 1 1 Race 0.1759686121 real
## 2 2 Race 0.4894556155 real
## 3 3 Race 0.2871015204 real
## 4 4 Race 0.0474742521 real
## 5 1 Education 0.2967140755 real
## 6 2 Education 0.2357037764 real
## 7 3 Education 0.2679744973 real
## 8 4 Education 0.1979401667 real
## 9 999 Education 0.0016674841 real
## 10 1 Marriage 0.5282000981 real
## 11 2 Marriage 0.0870034331 real
## 12 3 Marriage 0.1094654242 real
## 13 4 Marriage 0.0337420304 real
## 14 5 Marriage 0.1653751839 real
## 15 6 Marriage 0.0757233938 real
## 16 77 Marriage 0.0003923492 real
## 17 99 Marriage 0.0000980873 real
## 18 1 Total_household 0.1397743992 real
## 19 2 Total_household 0.3091711623 real
## 20 3 Total_household 0.1710642472 real
## 21 4 Total_household 0.1624325650 real
## 22 5 Total_household 0.1002452182 real
## 23 6 Total_household 0.0535556645 real
## 24 7 Total_household 0.0637567435 real
## 25 1 Total_income 0.0187346739 real
## 26 2 Total_income 0.0400196175 real
## 27 3 Total_income 0.0758214811 real
## 28 4 Total_income 0.0727807749 real
## 29 5 Total_income 0.0836684649 real
## 30 6 Total_income 0.1255517410 real
## 31 7 Total_income 0.0938695439 real
## 32 8 Total_income 0.0778813144 real
## 33 9 Total_income 0.0601275135 real
## 34 10 Total_income 0.0466895537 real
## 35 12 Total_income 0.0389406572 real
## 36 13 Total_income 0.0092202060 real
## 37 14 Total_income 0.0890632663 real
## 38 15 Total_income 0.1319274154 real
## 39 77 Total_income 0.0181461501 real
## 40 99 Total_income 0.0175576263 real
## 41 0 Gender 0.4863168220 real
## 42 1 Gender 0.5136831780 real
## Category Var Proportion Data
## 1 1 Race 0.177 stat
## 2 2 Race 0.476 stat
## 3 3 Race 0.299 stat
## 4 4 Race 0.048 stat
## 5 1 Education 0.296 stat
## 6 2 Education 0.242 stat
## 7 3 Education 0.251 stat
## 8 4 Education 0.210 stat
## 9 999 Education 0.001 stat
## 10 1 Marriage 0.526 stat
## 11 2 Marriage 0.096 stat
## 12 3 Marriage 0.108 stat
## 13 4 Marriage 0.035 stat
## 14 5 Marriage 0.169 stat
## 15 6 Marriage 0.066 stat
## 16 1 Total_household 0.138 stat
## 17 2 Total_household 0.313 stat
## 18 3 Total_household 0.162 stat
## 19 4 Total_household 0.175 stat
## 20 5 Total_household 0.094 stat
## 21 6 Total_household 0.050 stat
## 22 7 Total_household 0.068 stat
## 23 1 Total_income 0.025 stat
## 24 2 Total_income 0.040 stat
## 25 3 Total_income 0.074 stat
## 26 4 Total_income 0.073 stat
## 27 5 Total_income 0.086 stat
## 28 6 Total_income 0.123 stat
## 29 7 Total_income 0.078 stat
## 30 8 Total_income 0.074 stat
## 31 9 Total_income 0.056 stat
## 32 10 Total_income 0.048 stat
## 33 12 Total_income 0.037 stat
## 34 13 Total_income 0.007 stat
## 35 14 Total_income 0.093 stat
## 36 15 Total_income 0.136 stat
## 37 77 Total_income 0.028 stat
## 38 99 Total_income 0.022 stat
## 39 0 Gender 0.508 stat
## 40 1 Gender 0.492 stat
## Category Var Proportion Data
## 1 1 Race 0.142 ctgan
## 2 2 Race 0.351 ctgan
## 3 3 Race 0.414 ctgan
## 4 4 Race 0.093 ctgan
## 5 1 Education 0.316 ctgan
## 6 2 Education 0.214 ctgan
## 7 3 Education 0.256 ctgan
## 8 4 Education 0.200 ctgan
## 9 999 Education 0.014 ctgan
## 10 1 Marriage 0.300 ctgan
## 11 2 Marriage 0.100 ctgan
## 12 3 Marriage 0.205 ctgan
## 13 4 Marriage 0.063 ctgan
## 14 5 Marriage 0.192 ctgan
## 15 6 Marriage 0.127 ctgan
## 16 77 Marriage 0.008 ctgan
## 17 99 Marriage 0.005 ctgan
## 18 1 Total_household 0.102 ctgan
## 19 2 Total_household 0.207 ctgan
## 20 3 Total_household 0.205 ctgan
## 21 4 Total_household 0.205 ctgan
## 22 5 Total_household 0.087 ctgan
## 23 6 Total_household 0.060 ctgan
## 24 7 Total_household 0.134 ctgan
## 25 1 Total_income 0.006 ctgan
## 26 2 Total_income 0.011 ctgan
## 27 3 Total_income 0.043 ctgan
## 28 4 Total_income 0.094 ctgan
## 29 5 Total_income 0.089 ctgan
## 30 6 Total_income 0.080 ctgan
## 31 7 Total_income 0.076 ctgan
## 32 8 Total_income 0.067 ctgan
## 33 9 Total_income 0.060 ctgan
## 34 10 Total_income 0.040 ctgan
## 35 11 Total_income 0.029 ctgan
## 36 12 Total_income 0.020 ctgan
## 37 13 Total_income 0.007 ctgan
## 38 14 Total_income 0.092 ctgan
## 39 15 Total_income 0.208 ctgan
## 40 16 Total_income 0.020 ctgan
## 41 71 Total_income 0.002 ctgan
## 42 72 Total_income 0.001 ctgan
## 43 73 Total_income 0.002 ctgan
## 44 74 Total_income 0.003 ctgan
## 45 75 Total_income 0.003 ctgan
## 46 76 Total_income 0.009 ctgan
## 47 77 Total_income 0.006 ctgan
## 48 78 Total_income 0.007 ctgan
## 49 79 Total_income 0.007 ctgan
## 50 80 Total_income 0.003 ctgan
## 51 81 Total_income 0.002 ctgan
## 52 82 Total_income 0.003 ctgan
## 53 83 Total_income 0.005 ctgan
## 54 93 Total_income 0.002 ctgan
## 55 99 Total_income 0.003 ctgan
## 56 0 Gender 0.353 ctgan
## 57 1 Gender 0.647 ctgan
# Plot
prop_df1 <- get_category_proportions(vitd1,"real")
prop_df2 <- get_category_proportions(vitd1_new1,"stat")
prop_df3 <- get_category_proportions(vitd1_ctgan1,"ctgan")
# Combine proportions from all data frames
combined_prop_df <- bind_rows(prop_df1, prop_df2, prop_df3)
# Grouped bar chart, one facet per categorical variable (shown interactively)
ggplot(combined_prop_df, aes(x = Category, y = Proportion, fill = Data)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Comparison of Category Proportions",
x = "Category",
y = "Proportion",
fill = "Data Frame") +
theme_minimal()+
facet_wrap(~ Var, scales = "free")
# FIX: the assignment below was fused onto the previous line by the chunk
# extraction (syntax error); cat_plot is the saved-figure copy.
cat_plot=ggplot(combined_prop_df, aes(x = Category, y = Proportion, fill = Data)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Comparison of Category Proportions",
x = "Category",
y = "Proportion",
fill = "Data Frame") +
theme_minimal()+
facet_wrap(~ Var, scales = "free")
#ggsave("vitd_cat_prop.pdf",cat_plot,height = 10,width = 15)
# correlation plot
variables=c("RIDAGEYR", "Calcium" ,"Creatinine" , "Glucose_serum" , "Iron" , "Sodium" , "protein" , "Race", "Education" , "Marriage" ,"Total_household" ,"Total_income" , "Gender")
# Pairwise-complete correlations for each data source.
cor1 <- cor(as.matrix(vitd1[,variables]),use = "pairwise.complete.obs")
cor2 <- cor(as.matrix(vitd1_new1[,variables]),use = "pairwise.complete.obs")
cor3 <- cor(as.matrix(vitd1_ctgan1[,variables]),use = "pairwise.complete.obs")
par(mfrow = c(1, 3))
p1=corrplot::corrplot(cor1,tl.col = "black",col = colorRampPalette(c("blue", "white", "red"))(100))
p2=corrplot::corrplot(cor2,tl.col = "black",col = colorRampPalette(c("blue", "white", "red"))(100))
p3=corrplot::corrplot(cor3,tl.col = "black",col = colorRampPalette(c("blue", "white", "red"))(100)) #save as vitd_correlation
# Bubble-style correlation heatmap: circle = positive, square = negative,
# point size = |r|. One helper replaces three copy-pasted blocks.
plot_cor_bubbles <- function(cor_mat) {
  melted_cor_matrix <- melt(cor_mat)
  melted_cor_matrix$shape <- ifelse(melted_cor_matrix$value > 0, "Positive", "Negative")
  ggplot(data = melted_cor_matrix, aes(x = Var1, y = Var2, size = abs(value), shape = shape)) +
    geom_point(color = "black") +
    scale_size(range = c(1, 10), name = "Correlation Value") + # Adjust the size range as needed
    scale_shape_manual(values = c("Positive" = 16, "Negative" = 15)) + # 16 circle, 15 square
    theme_minimal() +
    labs(title = "Correlation Heatmap", x = "Variables", y = "Variables", shape = "Correlation Type") +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
    coord_fixed(ratio = 1)
}
g1=plot_cor_bubbles(cor1)
g2=plot_cor_bubbles(cor2)
g3=plot_cor_bubbles(cor3)
# Three-panel combined figure
ff=grid.arrange(g1,g2,g3,ncol=3)
#ggsave("vitd_correlation2.pdf",ff,height = 20,width = 20)
# 2d density plot
# Helper: joint-density contours + scatter for one variable pair. Column
# names are passed as strings via the ggplot2 `.data` pronoun; this replaces
# nine copy-pasted blocks with identical settings.
plot_density2d <- function(dat, xvar, yvar, xlab, ylab, title, xl, yl) {
  ggplot(dat, aes(x = .data[[xvar]], y = .data[[yvar]])) +
    geom_density_2d(aes(color = ..level..), bins = 50, bw = 0.1) +
    geom_point(alpha = 0.3, size = 0.3) +
    labs(x = xlab, y = ylab, title = title) +
    theme_bw() + theme(aspect.ratio = 1) +
    scale_color_gradient(low = "lightblue", high = "darkblue", limits = c(0, 0.2)) +
    xlim(xl) + ylim(yl)
}
g1=plot_density2d(vitd1, "Marriage", "RIDAGEYR", "Marriage", "age", "Real", c(0,6), c(-2,2))
g2=plot_density2d(vitd1_new1, "Marriage", "RIDAGEYR", "Marriage", "age", "Stat", c(0,6), c(-2,2))
g3=plot_density2d(vitd1_ctgan1, "Marriage", "RIDAGEYR", "Marriage", "age", "CTGAN", c(0,6), c(-2,2))
g4=plot_density2d(vitd1, "Calcium", "protein", "Calcium", "protein", "Real", c(-4,4), c(-4,4))
g5=plot_density2d(vitd1_new1, "Calcium", "protein", "Calcium", "protein", "Stat", c(-4,4), c(-4,4))
g6=plot_density2d(vitd1_ctgan1, "Calcium", "protein", "Calcium", "protein", "CTGAN", c(-4,4), c(-4,4))
g7=plot_density2d(vitd1, "vitD", "RIDAGEYR", "vitD", "RIDAGEYR", "Real", c(-2.5,3), c(-2,2))
g8=plot_density2d(vitd1_new1, "vitD", "RIDAGEYR", "vitD", "RIDAGEYR", "Stat", c(-2.5,3), c(-2,2))
g9=plot_density2d(vitd1_ctgan1, "vitD", "RIDAGEYR", "vitD", "RIDAGEYR", "CTGAN", c(-2.5,3), c(-2,2))
ff=grid.arrange(g1,g2,g3,g4,g5,g6,g7,g8,g9,ncol=3)
# FIX: the statement below was fused onto the grid.arrange line by the chunk
# extraction (syntax error); it starts the heart (binary outcome) section.
heart=read_csv("heart.csv")
# Reduced data set: age + outcome only.
heart1=heart[,colnames(heart)%in%c("age","target")]
#write_csv(heart1,"heart1.csv")
# Subset of seven covariates plus the outcome.
heart2=heart[,colnames(heart)%in%c("age","cp","trestbps" ,"chol","thalach","oldpeak","ca","target")]
covariates=heart2[,!colnames(heart2)=="target"]
#scaled_covariates <- apply(covariates, 2, function(x) (x - min(x)) / (max(x) - min(x)))
# z-score standardization of the continuous covariates
scaled_covariates <- apply(covariates, 2, function(x) scale(x))
heart2[,colnames(heart2)[1:7]]=scaled_covariates
#write_csv(heart2,"heart2.csv")
# Full data set with the same seven covariates standardized in place.
heart3=heart
covariates=heart3[,colnames(heart3)%in%c("age","cp","trestbps" ,"chol","thalach","oldpeak","ca")]
#scaled_covariates <- apply(covariates, 2, function(x) (x - min(x)) / (max(x) - min(x)))
scaled_covariates <- apply(covariates, 2, function(x) scale(x))
heart3[,colnames(heart3)%in%c("age","cp","trestbps" ,"chol","thalach","oldpeak","ca")]=scaled_covariates
#write_csv(heart3,"heart3.csv")
# FIX: this read was fused onto the commented write_csv line by the chunk
# extraction, which disabled it; it must run to start the modelling section.
heart3=read.csv("heart3.csv")
heartdt=heart3
# Covariates only; the logistic response is simulated later.
modgo_dt=heartdt[,!colnames(heartdt)=="target"]
colnames(modgo_dt)## [1] "age" "sex" "cp" "trestbps" "chol" "fbs"
## [7] "restecg" "thalach" "exang" "oldpeak" "slope" "ca"
## [13] "thal"
modgo_dt=as.data.frame(modgo_dt)
#modgo_dt$sex=as.integer(modgo_dt$sex)
# Binary and multi-level categorical covariates declared for modgo.
binary_variables <- c("sex","fbs","exang")
categorical_variables <- c("restecg","slope","thal")
# Draw 1000 synthetic rows of the heart covariates (single repetition).
test <- modgo(data = modgo_dt,bin_variables = binary_variables,categ_variables = categorical_variables,nrep = 1,ties_method= "max", variables= colnames(modgo_dt),
n_samples= 1000,sigma= c(),
noise_mu= FALSE, pertr_vec= c(),
change_cov= c(),change_amount= 0,seed= 1,
thresh_var= c(), thresh_force = FALSE,
var_prop= c(),var_infl=c(),infl_cov_stable=FALSE)
# First repetition's simulated covariate matrix
modgo_new1=test[[1]][[1]]
#generate output var
# Fit the "ground-truth" logistic model on the real heart data; it defines
# the outcome-generating mechanism for the synthetic covariates.
formula_str <- paste(colnames(modgo_dt), collapse = " + ")
formula_obj <- as.formula(paste("target ~", formula_str))
lm_model <- glm(formula_obj, data = heartdt, family=binomial())
summary(lm_model)##
## Call:
## glm(formula = formula_obj, family = binomial(), data = heartdt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.59660 0.47965 5.414 6.18e-08 ***
## age -0.07439 0.11439 -0.650 0.51551
## sex -1.84655 0.25657 -7.197 6.16e-13 ***
## cp 0.87993 0.10333 8.516 < 2e-16 ***
## trestbps -0.31948 0.09846 -3.245 0.00118 **
## chol -0.29251 0.10608 -2.757 0.00583 **
## fbs -0.10115 0.28486 -0.355 0.72251
## restecg 0.41324 0.18898 2.187 0.02877 *
## thalach 0.54353 0.13071 4.158 3.21e-05 ***
## exang -0.99079 0.22429 -4.418 9.98e-06 ***
## oldpeak -0.67064 0.13631 -4.920 8.66e-07 ***
## slope 0.53407 0.18868 2.831 0.00465 **
## ca -0.77772 0.10623 -7.321 2.45e-13 ***
## thal -0.88607 0.15563 -5.693 1.24e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1420.2 on 1024 degrees of freedom
## Residual deviance: 718.9 on 1011 degrees of freedom
## AIC: 746.9
##
## Number of Fisher Scoring iterations: 6
beta <- coef(lm_model)[-1] # Exclude intercept
# Simulate new data based on the regression model
set.seed(20240421) # Set seed for reproducibility
probs<- predict(lm_model, newdata = as.data.frame(modgo_new1), type = "response")
target <- rbinom(n = 1000, size = 1, prob = probs)
vitd1_new1=cbind.data.frame(modgo_new1,target)
# Persisted copy of the statistical simulation; re-read real, stat and CTGAN
# versions so all three can be compared.
#write_csv(vitd1_new1,file="heart_3_stat.csv")
vitd1=read.csv("heart3.csv")             # real heart data (this read had been fused into the comment above in the original source)
vitd1_new1=read.csv("heart_3_stat.csv")  # statistical (modgo) simulation
vitd1_ctgan1=read.csv("heart3_sim1.csv") # CTGAN simulation
# "age" "sex" "cp" "trestbps" "chol" "fbs" "restecg" "thalach" "exang"
# "oldpeak" "slope" "ca" "thal" "target"
# visualisations
# distribution of covariates
# Helper: overlay the density of one covariate across the three datasets.
covariate_density <- function(var) {
  df <- data.frame(value = c(vitd1[[var]], vitd1_new1[[var]], vitd1_ctgan1[[var]]),
                   group = c(rep("Real", nrow(vitd1)), rep("Stat", nrow(vitd1_new1)),
                             rep("CTGAN", nrow(vitd1_ctgan1))))
  ggplot(df, aes(x = value, fill = group)) +
    geom_density(alpha = 0.3) +
    labs(title = paste("Distribution of", var), x = "Value", y = "Density") +
    scale_fill_manual(values = c("blue", "red", "green")) + theme_bw()
}
# Build each panel once (the original built every plot twice, and the second
# copies were fused onto the preceding statements, breaking the syntax).
g1 = covariate_density("age")
print(g1)
g2 = covariate_density("cp")
print(g2)
g3 = covariate_density("trestbps")
print(g3)
ff=grid.arrange(g1,g2,g3,ncol=1)
#ggsave("distribution2.pdf",ff,height = 10,width = 5)
# distribution of response
# Per-dataset class proportions of `target`, for a dodged bar chart.
target_props <- function(dat, label) {
  tbl <- as.data.frame(prop.table(table(dat$target)))
  names(tbl) <- c("target", "Count")
  tbl$Dataset <- label
  tbl
}
freq_df1 <- target_props(vitd1, "Real")
freq_df2 <- target_props(vitd1_new1, "Stat")
freq_df3 <- target_props(vitd1_ctgan1, "CTGAN")
# Combine the frequency tables
combined_df <- rbind(freq_df1, freq_df2, freq_df3)
# Plotting
ggplot(combined_df, aes(x = target, y = Count, fill = Dataset)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Comparison of target by Dataset",
       x = "target",
       y = "Count",
       fill = "Dataset") +
  theme_minimal()
# model prediction and feature selection (train on synthetic and evaluation on
# real) (Brett K Beaulieu-Jones, Zhiwei Steven Wu, Chris Williams, Ran Lee,
# Sanjeev P Bhavnani, James Brian Byrd, and Casey S Greene. Privacy-preserving
# generative deep neural networks support clinical data sharing. Circulation:
# Cardiovascular Quality and Outcomes, 12(7):e005122, 2019.)
# Train-on-synthetic / test-on-real evaluation for binary classification
# (after Beaulieu-Jones et al., 2019).
# train_data: synthetic dataset used to fit the logistic model
# test_data : real dataset used for evaluation
# Returns a list: fitted coefficients, balanced accuracy and F1 on test_data.
# NOTE(review): the model formula is built from the global `vitd1` column
# names, so the function silently assumes train/test share vitd1's layout.
train_and_test_glm <- function(train_data, test_data) {
# Fit a logistic regression of target on the first 13 columns
formula_str <- paste(colnames(vitd1)[1:13], collapse = " + ")
formula_obj <- as.formula(paste("target ~", formula_str))
lm_model <- glm(formula_obj, data = train_data,family = binomial())
# Predicted event probabilities on the test data
test_predictions <- predict(lm_model, newdata = test_data,type = "response")
# Classify at the median predicted probability (quantile(...)[3] is the 50%
# quantile): the top half of predictions are labelled 1
predicted_classes <- ifelse(test_predictions > quantile(test_predictions)[3], 1, 0)
# Create confusion matrix
actual=factor(test_data$target,levels = c(0,1))
predicted=factor(predicted_classes,levels = c(0,1))
# NOTE(review): caret::confusionMatrix expects (data = predictions,
# reference = truth); here truth is passed first, so the table rows are the
# actual labels and the columns the predictions.  The TP/FP/FN extraction
# below is consistent with that orientation, but `byClass` statistics
# (balanced accuracy) then treat `actual` as the prediction — confirm this
# argument order is intended before reusing these numbers.
conf_matrix <- confusionMatrix(actual, predicted)
# Calculate balanced accuracy
balanced_accuracy <- conf_matrix$byClass["Balanced Accuracy"]
TP <- conf_matrix$table[2,2] # True Positive  (actual 1, predicted 1)
FP <- conf_matrix$table[1,2] # False Positive (actual 0, predicted 1)
FN <- conf_matrix$table[2,1] # False Negative (actual 1, predicted 0)
# Calculate precision and recall
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
# Calculate F1 score
F1 <- 2 * precision * recall / (precision + recall)
# Return the coefficients and evaluation metrics
return(list(coefficients = coef(lm_model), accuracy = balanced_accuracy,f1=F1))
}
# Fit on each synthetic dataset, evaluate on the real data
stat_pred=train_and_test_glm(train_data=vitd1_new1,test_data=vitd1)
ctgan_pred=train_and_test_glm(train_data=vitd1_ctgan1,test_data=vitd1)
# One row per generator: fitted coefficients plus evaluation metrics
featuredt=rbind.data.frame(stat_pred$coefficients,ctgan_pred$coefficients)
colnames(featuredt)=names(stat_pred$coefficients)
#featuredt
featuredt$accuracy=c(stat_pred[[2]],ctgan_pred[[2]])
featuredt$f1=c(stat_pred[[3]],ctgan_pred[[3]])
rownames(featuredt)=c("stat","ctgan")
featuredt## (Intercept) age sex cp trestbps chol
## stat 2.547252 0.008042631 -2.121195 1.1996785 -0.2353949 -0.4773610
## ctgan 3.464948 -0.543340147 -1.196122 0.8580528 0.1577054 0.1288896
## fbs restecg thalach exang oldpeak slope
## stat 0.1987707 0.7293565 0.6500377 -0.8725433 -0.7547665 0.3812739
## ctgan -0.7579080 -0.4652356 0.2150923 -1.2265753 -0.5988021 0.1513724
## ca thal accuracy f1
## stat -0.7128754 -0.8278048 0.8517205 0.8535645
## ctgan -0.6020966 -0.9803769 0.8244622 0.8262548
# sanity checks (https://github.com/vanderschaarlab/synthcity)(not in the paper, decided not doing here)
# also, the privacy related metrics are not used here
# Marginal similarity of each synthetic dataset to the real one for `age`
similarity_age=rbind.data.frame(similarity_metric(vec1=vitd1_new1$age,vec2=vitd1$age),similarity_metric(vec1=vitd1_ctgan1$age,vec2=vitd1$age))
colnames(similarity_age)=c("avg_inv_KLD","ks","JS","Wasserstein")
rownames(similarity_age)=c("stat","ctgan")
print("similarity_age")## [1] "similarity_age"
## avg_inv_KLD ks JS Wasserstein
## stat -0.57698779 0.02165854 NaN 0.0009940358
## ctgan -0.05501899 0.07653659 NaN 0.0009633911
# Same similarity metrics for the binary response `target`
similarity_target=rbind.data.frame(similarity_metric(vec1=vitd1_new1$target,vec2=vitd1$target),similarity_metric(vec1=vitd1_ctgan1$target,vec2=vitd1$target))
colnames(similarity_target)=c("avg_inv_KLD","ks","JS","Wasserstein")
rownames(similarity_target)=c("stat","ctgan")
print("similarity_target")## [1] "similarity_target"
## avg_inv_KLD ks JS Wasserstein
## stat 0 0.02517073 15.32971 0.0009871668
## ctgan 0 0.03817073 15.95306 0.0010000000
# Compute similarity metrics (stat vs real, ctgan vs real) for every column of
# the real dataset; returns a named list of one small data frame per column.
# dataset1: real data; dataset2: stat simulation; dataset3: CTGAN simulation.
calculate_similarity_for_all_columns <- function(dataset1, dataset2, dataset3) {
  cols <- names(dataset1)
  result_list <- lapply(cols, function(col) {
    sim <- rbind.data.frame(
      similarity_metric(vec1 = dataset2[[col]], vec2 = dataset1[[col]]),
      similarity_metric(vec1 = dataset3[[col]], vec2 = dataset1[[col]]))
    colnames(sim) <- c("avg_inv_KLD", "ks", "JS", "Wasserstein")
    rownames(sim) <- c("stat", "ctgan")
    sim
  })
  names(result_list) <- cols
  result_list
}
# Call the function to get similarity results for all columns
similarity_results <- calculate_similarity_for_all_columns(vitd1,vitd1_new1,vitd1_ctgan1)
#write_xlsx(similarity_results, "heart_similarity.xlsx")
# categorical variables plot
# Proportion of each category level per categorical variable, tagged with the
# dataset name so results from several datasets can be combined.
# df       : a heart dataset (real or synthetic) with the standard columns
# data_name: label stored in the output's Data column
# The categorical columns are named explicitly (previously they were looked up
# as colnames(vitd1)[c(2,6,7,9,11,13)], a hidden dependency on global state).
get_category_proportions <- function(df,data_name) {
categorical_cols <- c("sex","fbs","restecg","exang","slope","thal")
df[categorical_cols] <- lapply(df[categorical_cols], factor)
# Get names of categorical variables (exactly the ones converted above)
cat_vars <- names(df)[sapply(df, is.factor)]
# Create an empty data frame to store proportions
prop_df <- data.frame(Category = character(), Proportion = numeric(), Data_Frame = character())
# Loop through each categorical variable
for (var in cat_vars) {
# Get proportions of categories
prop <- table(df[[var]]) / length(df[[var]])
# data.frame(Proportion = prop) expands the table into Proportion.Var1 /
# Proportion.Freq; bind_rows aligns the columns by name
prop_df <- bind_rows(prop_df, data.frame(Category = names(prop), Proportion = prop, Data_Frame = var))
}
# Keep the frequency column and rename to the final layout
prop_df=prop_df[,c("Category","Data_Frame","Proportion.Freq")]
colnames(prop_df)=c("Category","Var","Proportion")
prop_df$Data=data_name
return(prop_df)
}
# Echo the per-category proportions for the real data (pasted console output
# below; the stat and ctgan tables follow).
get_category_proportions(vitd1,"real")## Category Var Proportion Data
## 1 0 sex 0.304390244 real
## 2 1 sex 0.695609756 real
## 3 0 fbs 0.850731707 real
## 4 1 fbs 0.149268293 real
## 5 0 restecg 0.484878049 real
## 6 1 restecg 0.500487805 real
## 7 2 restecg 0.014634146 real
## 8 0 exang 0.663414634 real
## 9 1 exang 0.336585366 real
## 10 0 slope 0.072195122 real
## 11 1 slope 0.470243902 real
## 12 2 slope 0.457560976 real
## 13 0 thal 0.006829268 real
## 14 1 thal 0.062439024 real
## 15 2 thal 0.530731707 real
## 16 3 thal 0.400000000 real
# NOTE(review): the calls that produced the "stat" and "ctgan" tables below
# (get_category_proportions on vitd1_new1 / vitd1_ctgan1) are not present in
# this source — they appear to have been lost during extraction.
## Category Var Proportion Data
## 1 0 sex 0.311 stat
## 2 1 sex 0.689 stat
## 3 0 fbs 0.837 stat
## 4 1 fbs 0.163 stat
## 5 0 restecg 0.484 stat
## 6 1 restecg 0.501 stat
## 7 2 restecg 0.015 stat
## 8 0 exang 0.682 stat
## 9 1 exang 0.318 stat
## 10 0 slope 0.073 stat
## 11 1 slope 0.485 stat
## 12 2 slope 0.442 stat
## 13 0 thal 0.005 stat
## 14 1 thal 0.072 stat
## 15 2 thal 0.535 stat
## 16 3 thal 0.388 stat
## Category Var Proportion Data
## 1 0 sex 0.377 ctgan
## 2 1 sex 0.623 ctgan
## 3 0 fbs 0.724 ctgan
## 4 1 fbs 0.276 ctgan
## 5 0 restecg 0.504 ctgan
## 6 1 restecg 0.454 ctgan
## 7 2 restecg 0.042 ctgan
## 8 0 exang 0.619 ctgan
## 9 1 exang 0.381 ctgan
## 10 0 slope 0.111 ctgan
## 11 1 slope 0.501 ctgan
## 12 2 slope 0.388 ctgan
## 13 0 thal 0.021 ctgan
## 14 1 thal 0.088 ctgan
## 15 2 thal 0.401 ctgan
## 16 3 thal 0.490 ctgan
# Plot: category proportions per dataset, faceted by variable.
prop_df1 <- get_category_proportions(vitd1,"real")
prop_df2 <- get_category_proportions(vitd1_new1,"stat")
prop_df3 <- get_category_proportions(vitd1_ctgan1,"ctgan")
# Combine proportions from all data frames
combined_prop_df <- bind_rows(prop_df1, prop_df2, prop_df3)
# Build the plot once, display it, and keep the handle for saving (the
# original built it twice and the second copy was fused onto facet_wrap(),
# breaking the syntax).
cat_prop=ggplot(combined_prop_df, aes(x = Category, y = Proportion, fill = Data)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Comparison of Category Proportions",
       x = "Category",
       y = "Proportion",
       fill = "Data Frame") +
  theme_minimal() +
  facet_wrap(~ Var, scales = "free")
print(cat_prop)
#ggsave("heart_cat_prop.pdf",cat_prop,height = 10,width = 15)
# correlation plot
variables=c("age","cp","trestbps" ,"chol","thalach","oldpeak","ca","target","sex","fbs","exang","restecg","slope","thal")
cor1 <- cor(as.matrix(vitd1[,variables]),use = "pairwise.complete.obs")
cor2 <- cor(as.matrix(vitd1_new1[,variables]),use = "pairwise.complete.obs")
cor3 <- cor(as.matrix(vitd1_ctgan1[,variables]),use = "pairwise.complete.obs")
par(mfrow = c(1, 3))
p1=corrplot::corrplot(cor1,tl.col = "black",col = colorRampPalette(c("blue", "white", "red"))(100))
p2=corrplot::corrplot(cor2,tl.col = "black",col = colorRampPalette(c("blue", "white", "red"))(100))
p3=corrplot::corrplot(cor3,tl.col = "black",col = colorRampPalette(c("blue", "white", "red"))(100))#save as heart_correlation
# Helper: bubble-style correlation heatmap — point size encodes |r|, shape
# encodes the sign of r.  (The original repeated this block three times.)
cor_bubble_plot <- function(cor_mat) {
  m <- melt(cor_mat)
  m$shape <- ifelse(m$value > 0, "Positive", "Negative")
  ggplot(data = m, aes(x = Var1, y = Var2, size = abs(value), shape = shape)) +
    geom_point(color = "black") +
    scale_size(range = c(1, 10),name = "Correlation Value") + # Adjust the size range as needed
    scale_shape_manual(values = c("Positive" = 16, "Negative" = 15)) + # 16 for circle, 15 for square
    theme_minimal() +
    labs(title = "Correlation Heatmap", x = "Variables", y = "Variables", shape = "Correlation Type") +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
    coord_fixed(ratio = 1)
}
g1 = cor_bubble_plot(cor1)
g2 = cor_bubble_plot(cor2)
g3 = cor_bubble_plot(cor3)
ff=grid.arrange(g1,g2,g3,ncol=3) #ggsave("heart_correlation2.pdf",ff,height = 20,width = 20)
# 2d density plot
# Helper: 2-D density contours plus scatter overlay for one dataset /
# variable pair.  (`..level..` kept for consistency with the rest of the
# file; newer ggplot2 spells this after_stat(level).)
density2d_panel <- function(dat, xvar, yvar, title, xlims, ylims) {
  ggplot(dat, aes(x = .data[[xvar]], y = .data[[yvar]])) +
    geom_density_2d(aes(color = ..level..), bins = 50, bw = 0.1) +
    geom_point(alpha = 0.3, size = 0.3) +
    labs(x = xvar, y = yvar, title = title) +
    theme_bw() + theme(aspect.ratio = 1) +
    scale_color_gradient(low = "lightblue", high = "darkblue", limits = c(0, 0.2)) +
    xlim(xlims) + ylim(ylims)
}
g1 = density2d_panel(vitd1,        "thalach", "age",   "Real",  c(-3, 3), c(-3, 3))
g2 = density2d_panel(vitd1_new1,   "thalach", "age",   "Stat",  c(-3, 3), c(-3, 3))
g3 = density2d_panel(vitd1_ctgan1, "thalach", "age",   "CTGAN", c(-3, 3), c(-3, 3))
g4 = density2d_panel(vitd1,        "target",  "age",   "Real",  c(0, 1),  c(0, 1))
g5 = density2d_panel(vitd1_new1,   "target",  "age",   "Stat",  c(0, 1),  c(0, 1))
g6 = density2d_panel(vitd1_ctgan1, "target",  "age",   "CTGAN", c(0, 1),  c(0, 1))
g7 = density2d_panel(vitd1,        "thalach", "exang", "Real",  c(-3, 3), c(0, 1))
g8 = density2d_panel(vitd1_new1,   "thalach", "exang", "Stat",  c(-3, 3), c(0, 1))
g9 = density2d_panel(vitd1_ctgan1, "thalach", "exang", "CTGAN", c(-3, 3), c(0, 1))
ff=grid.arrange(g1,g2,g3,g7,g8,g9,g4,g5,g6,ncol=3)
# --- veteran (right-censored survival) dataset starts here; this read had
# been fused onto the grid.arrange() call in the original source. ---
veteran=read_csv("veteran.csv")
# --- veteran data preparation ---
# veteran1: minimal subset (outcome + age only)
veteran1=veteran[,colnames(veteran)%in%c("status","time","num_age")]
#write_csv(veteran1,"veteran1.csv")
# veteran2: outcome + three numeric covariates, standardised
veteran2=veteran[,colnames(veteran)%in%c("status","time","num_age","num_karno","num_diagtime")]
covariates=veteran2[,!colnames(veteran2)%in%c("status","time")]
#scaled_covariates <- apply(covariates, 2, function(x) (x - min(x)) / (max(x) - min(x)))
scaled_covariates <- as.data.frame(apply(covariates, 2, function(x) scale(x)))
veteran2[,colnames(veteran2)[3:5]]=scaled_covariates
#write_csv(veteran2,"veteran2.csv")
# veteran3: all variables; numeric covariates standardised, factor variables
# recoded to the numeric codes of the classic veteran dataset
veteran3=veteran
covariates=veteran3[,colnames(veteran3)%in%c("num_age","num_karno","num_diagtime")]
#scaled_covariates <- apply(covariates, 2, function(x) (x - min(x)) / (max(x) - min(x)))
scaled_covariates <- apply(covariates, 2, function(x) scale(x))
veteran3[,colnames(veteran3)%in%c("num_age","num_karno","num_diagtime")]=scaled_covariates
veteran3$fac_trt=ifelse(veteran3$fac_trt=="standard",1,2)
veteran3$fac_celltype=ifelse(veteran3$fac_celltype=="squamous",1,ifelse(veteran3$fac_celltype=="smallcell",2,ifelse(veteran3$fac_celltype=="adeno",3,4)))
veteran3$fac_prior=ifelse(veteran3$fac_prior=="N",0,10)
#write_csv(veteran3,"veteran3.csv")
# Covariate-only frame for modgo (this assignment had been swallowed by the
# preceding comment in the original source).
modgo_dt=as.data.frame(veteran3[,!colnames(veteran3)%in%c("time","status")])
bin_variables=c("fac_trt")
categ_variables=c("fac_celltype","fac_prior")
test <- modgo(data = modgo_dt,bin_variables = bin_variables,categ_variables = categ_variables,nrep = 1,ties_method= "max", variables= colnames(modgo_dt),
n_samples= 1000,sigma= c(),
noise_mu= FALSE, pertr_vec= c(),
change_cov= c(),change_amount= 0,seed= 1,
thresh_var= c(), thresh_force = FALSE,
var_prop= c(),var_infl=c(),infl_cov_stable=FALSE)
modgo_new1=test[[1]][[1]]
# Simulate right-censored survival outcomes for the synthetic covariates
veteran1_stat=sim_fun_cox(dat=veteran3,seed=20240426,n=1000,X=modgo_new1)
# Sanity checks: event counts and follow-up time distributions
table(veteran1_stat$status)
table(veteran2$status)
summary(veteran1_stat$time)
summary(veteran2$time)
#write_csv(veteran1_stat,"veteran3_stat.csv")
# Re-read real, stat and CTGAN versions (this first read had been swallowed
# by the preceding comment in the original source).
vitd1=read.csv("veteran3.csv")
vitd1_new1=read.csv("veteran3_stat.csv")
vitd1_ctgan1=read.csv("veteran3_sim1.csv")
# visualisations
# distribution of covariate num_age across Real / Stat / CTGAN
df <- data.frame(value = c(vitd1$num_age, vitd1_new1$num_age, vitd1_ctgan1$num_age),
                 group = c(rep("Real", nrow(vitd1)), rep("Stat", nrow(vitd1_new1)),
                           rep("CTGAN", nrow(vitd1_ctgan1))))
ggplot(df, aes(x = value, fill = group)) +
  geom_density(alpha = 0.3) +
  labs(title = "Distribution of age", x = "Value", y = "Density") +
  scale_fill_manual(values = c("blue", "red", "green")) + theme_bw()
# distribution of the survival time (this statement had been fused onto the
# previous plot in the original source)
df <- data.frame(value = c(vitd1$time, vitd1_new1$time, vitd1_ctgan1$time),
                 group = c(rep("Real", nrow(vitd1)), rep("Stat", nrow(vitd1_new1)),
                           rep("CTGAN", nrow(vitd1_ctgan1))))
ggplot(df, aes(x = value, fill = group)) +
  geom_density(alpha = 0.3) +
  labs(title = "Distribution of time", x = "Value", y = "Density") +
  scale_fill_manual(values = c("blue", "red", "green")) + theme_bw()
# distribution of response (event indicator `status`)
freq_df1 <- table(vitd1$status) %>% prop.table()%>% as.data.frame() %>% rename(status = Var1, Count = Freq)
freq_df1$Dataset <- "Real"
freq_df2 <- table(vitd1_new1$status) %>% prop.table()%>% as.data.frame() %>% rename(status = Var1, Count = Freq)
freq_df2$Dataset <- "Stat"
freq_df3 <- table(vitd1_ctgan1$status) %>% prop.table()%>% as.data.frame() %>% rename(status = Var1, Count = Freq)
freq_df3$Dataset <- "CTGAN"
# Combine the frequency tables
combined_df <- rbind(freq_df1, freq_df2,freq_df3)
# Plotting
ggplot(combined_df, aes(x = status, y = Count, fill = Dataset)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Comparison of status by Dataset",
       x = "status",
       y = "Count",
       fill = "Dataset") +
  theme_minimal()
# similarity metrics for num_age (this statement had been fused onto
# theme_minimal() in the original source)
similarity_age=rbind.data.frame(similarity_metric(vec1=vitd1_new1$num_age,vec2=vitd1$num_age),similarity_metric(vec1=vitd1_ctgan1$num_age,vec2=vitd1$num_age))
colnames(similarity_age)=c("avg_inv_KLD","ks","JS","Wasserstein")
rownames(similarity_age)=c("stat","ctgan")
print("similarity_age")## [1] "similarity_age"
# Call the function to get similarity results for all columns
similarity_results <- calculate_similarity_for_all_columns(vitd1,vitd1_new1,vitd1_ctgan1)
#write_xlsx(similarity_results, "veteran_similarity.xlsx")
# model prediction and feature selection (train on synthetic and evaluation on
# real) (Brett K Beaulieu-Jones, Zhiwei Steven Wu, Chris Williams, Ran Lee,
# Sanjeev P Bhavnani, James Brian Byrd, and Casey S Greene. Privacy-preserving
# generative deep neural networks support clinical data sharing. Circulation:
# Cardiovascular Quality and Outcomes, 12(7):e005122, 2019.)
# Train-on-synthetic / test-on-real evaluation for right-censored survival.
# train_data: synthetic dataset used to fit the Cox model
# test_data : real dataset used for evaluation
# Returns a list: Cox coefficients, Harrell's C-index and a summed Brier score
# on test_data.
# NOTE(review): the formula is built from the global `vitd1` columns 3:8
# (num_karno .. fac_prior), so it assumes that column layout — confirm.
train_and_test_cox <- function(train_data, test_data) {
# Fit a Cox proportional hazards model on the synthetic training data
formula_str <- paste(colnames(vitd1)[3:8], collapse = " + ")
formula_obj <- as.formula(paste("Surv(time,status) ~", formula_str))
coxmod <- coxph(formula_obj,x=TRUE, data = train_data)
# Risk predictions on the real data; negated so that higher risk pairs with
# shorter survival for rcorr.cens
predd<-predict(object=coxmod,newdata=test_data,type="risk")
harrelC1 <- Hmisc::rcorr.cens(-predd,with(test_data,survival::Surv(time,status)))
hc<-harrelC1["C Index"]
# Linear predictors on training (lp) and test (lpnew) data for predErr
lp<- predict(coxmod)
lpnew <- predict(coxmod, newdata=test_data)
Surv.rsp <- survival::Surv(train_data$time, train_data$status)
Surv.rsp.new <- survival::Surv(test_data$time, test_data$status)
# Prediction-error (Brier) curve evaluated at the observed test times
briers1 <- survAUC::predErr(Surv.rsp, Surv.rsp.new, lp, lpnew,times=test_data$time, type = "brier", int.type = "unweighted")$error
briers1_without_inf <- briers1[is.finite(briers1)]
# NOTE(review): this is the SUM of the finite pointwise Brier errors, not an
# integrated/averaged Brier score — it scales with the number of test times.
bs<-sum(na.omit(briers1_without_inf))
# ibsfun1=purrr::possibly(function(modell){
# briers3 <- pec::pec(list("cox1"=modell),data=test_data,formula=formula_obj,cens.model="cox")
# return(crps(briers3)[2])
# },otherwise = NA)
# bs3<-ibsfun1(coxmod)
# Return the coefficients and evaluation metrics
return(list(coefficients = coef(coxmod), cindex = hc,Brier_score=bs))
}
# Fit the Cox model on each synthetic dataset, evaluate on the real data
stat_pred=train_and_test_cox(train_data=vitd1_new1,test_data=vitd1)
ctgan_pred=train_and_test_cox(train_data=vitd1_ctgan1,test_data=vitd1)
# One row per generator: Cox coefficients plus evaluation metrics
featuredt=rbind.data.frame(stat_pred$coefficients,ctgan_pred$coefficients)
colnames(featuredt)=names(stat_pred$coefficients)
#featuredt
featuredt$cindex=c(stat_pred[[2]],ctgan_pred[[2]])
featuredt$bs=c(stat_pred[[3]],ctgan_pred[[3]])
rownames(featuredt)=c("stat","ctgan")
featuredt## num_karno num_diagtime num_age fac_trt fac_celltype fac_prior
## stat -0.7568533 -0.043373858 -0.02391113 0.16342553 0.10015451 -0.006586734
## ctgan -0.3415392 0.005467004 0.20070092 0.07524924 0.01852984 0.014283778
## cindex bs
## stat 0.7091095 22.50444
## ctgan 0.6748069 64.50980
# categorical variables plot (veteran data)
# Proportion of each category level per categorical variable, tagged with the
# dataset name.  This intentionally redefines the heart-data version for the
# veteran analysis.  The categorical columns are named explicitly (previously
# they were looked up as colnames(vitd1)[c(6:8)], a hidden dependency on
# global state).
get_category_proportions <- function(df,data_name) {
categorical_cols <- c("fac_trt","fac_celltype","fac_prior")
df[categorical_cols] <- lapply(df[categorical_cols], factor)
# Get names of categorical variables (exactly the ones converted above)
cat_vars <- names(df)[sapply(df, is.factor)]
# Create an empty data frame to store proportions
prop_df <- data.frame(Category = character(), Proportion = numeric(), Data_Frame = character())
# Loop through each categorical variable
for (var in cat_vars) {
# Get proportions of categories
prop <- table(df[[var]]) / length(df[[var]])
# data.frame(Proportion = prop) expands the table into Proportion.Var1 /
# Proportion.Freq; bind_rows aligns the columns by name
prop_df <- bind_rows(prop_df, data.frame(Category = names(prop), Proportion = prop, Data_Frame = var))
}
# Keep the frequency column and rename to the final layout
prop_df=prop_df[,c("Category","Data_Frame","Proportion.Freq")]
colnames(prop_df)=c("Category","Var","Proportion")
prop_df$Data=data_name
return(prop_df)
}
# Echo the per-category proportions (pasted console output below; the stat and
# ctgan tables follow — their producing calls are not visible in this source
# and appear to have been lost during extraction).
get_category_proportions(vitd1,"real")## Category Var Proportion Data
## 1 1 fac_trt 0.5036496 real
## 2 2 fac_trt 0.4963504 real
## 3 1 fac_celltype 0.2554745 real
## 4 2 fac_celltype 0.3503650 real
## 5 3 fac_celltype 0.1970803 real
## 6 4 fac_celltype 0.1970803 real
## 7 0 fac_prior 0.7080292 real
## 8 10 fac_prior 0.2919708 real
## Category Var Proportion Data
## 1 1 fac_trt 0.501 stat
## 2 2 fac_trt 0.499 stat
## 3 1 fac_celltype 0.280 stat
## 4 2 fac_celltype 0.326 stat
## 5 3 fac_celltype 0.192 stat
## 6 4 fac_celltype 0.202 stat
## 7 0 fac_prior 0.707 stat
## 8 10 fac_prior 0.293 stat
## Category Var Proportion Data
## 1 1 fac_trt 0.554 ctgan
## 2 2 fac_trt 0.446 ctgan
## 3 1 fac_celltype 0.217 ctgan
## 4 2 fac_celltype 0.356 ctgan
## 5 3 fac_celltype 0.241 ctgan
## 6 4 fac_celltype 0.186 ctgan
## 7 0 fac_prior 0.654 ctgan
## 8 10 fac_prior 0.346 ctgan
# Plot: veteran categorical proportions per dataset, faceted by variable.
prop_df1 <- get_category_proportions(vitd1,"real")
prop_df2 <- get_category_proportions(vitd1_new1,"stat")
prop_df3 <- get_category_proportions(vitd1_ctgan1,"ctgan")
# Combine proportions from all data frames
combined_prop_df <- bind_rows(prop_df1, prop_df2, prop_df3)
# Build the plot once, display it, and keep the handle for saving (the
# original built it twice and the second copy was fused onto facet_wrap(),
# breaking the syntax).
cat_plot=ggplot(combined_prop_df, aes(x = Category, y = Proportion, fill = Data)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Comparison of Category Proportions",
       x = "Category",
       y = "Proportion",
       fill = "Data Frame") +
  theme_minimal() +
  facet_wrap(~ Var, scales = "free")
print(cat_plot)
#ggsave("veteran_cat_prop.pdf",cat_plot,height=5,width = 10)
# correlation plot (veteran data)
variables=c("num_karno","num_diagtime" ,"num_age", "fac_trt","fac_celltype","fac_prior")
cor1 <- cor(as.matrix(vitd1[,variables]),use = "pairwise.complete.obs")
cor2 <- cor(as.matrix(vitd1_new1[,variables]),use = "pairwise.complete.obs")
cor3 <- cor(as.matrix(vitd1_ctgan1[,variables]),use = "pairwise.complete.obs")
par(mfrow = c(1, 3))
p1=corrplot::corrplot(cor1,tl.col = "black",col = colorRampPalette(c("blue", "white", "red"))(100))
p2=corrplot::corrplot(cor2,tl.col = "black",col = colorRampPalette(c("blue", "white", "red"))(100))
p3=corrplot::corrplot(cor3,tl.col = "black",col = colorRampPalette(c("blue", "white", "red"))(100)) #save as veteran_correlation
# Helper: bubble-style correlation heatmap — point size encodes |r|, shape
# encodes the sign of r.  (The original repeated this block three times.)
cor_bubble_plot <- function(cor_mat) {
  m <- melt(cor_mat)
  m$shape <- ifelse(m$value > 0, "Positive", "Negative")
  ggplot(data = m, aes(x = Var1, y = Var2, size = abs(value), shape = shape)) +
    geom_point(color = "black") +
    scale_size(range = c(1, 10),name = "Correlation Value") + # Adjust the size range as needed
    scale_shape_manual(values = c("Positive" = 16, "Negative" = 15)) + # 16 for circle, 15 for square
    theme_minimal() +
    labs(title = "Correlation Heatmap", x = "Variables", y = "Variables", shape = "Correlation Type") +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
    coord_fixed(ratio = 1)
}
g1 = cor_bubble_plot(cor1)
g2 = cor_bubble_plot(cor2)
g3 = cor_bubble_plot(cor3)
ff=grid.arrange(g1,g2,g3,ncol=3) #ggsave("veteran_correlation2.pdf",ff,height = 20,width = 20)
# 2d density plot (veteran data)
# Helper: 2-D density contours plus scatter overlay for one dataset /
# variable pair.
density2d_panel <- function(dat, xvar, yvar, title, xlims, ylims) {
  ggplot(dat, aes(x = .data[[xvar]], y = .data[[yvar]])) +
    geom_density_2d(aes(color = ..level..), bins = 50, bw = 0.1) +
    geom_point(alpha = 0.3, size = 0.3) +
    labs(x = xvar, y = yvar, title = title) +
    theme_bw() + theme(aspect.ratio = 1) +
    scale_color_gradient(low = "lightblue", high = "darkblue", limits = c(0, 0.2)) +
    xlim(xlims) + ylim(ylims)
}
g1  = density2d_panel(vitd1,        "num_karno",    "fac_prior",    "Real",  c(-3, 2), c(0, 10))
g2  = density2d_panel(vitd1_new1,   "num_karno",    "fac_prior",    "Stat",  c(-3, 2), c(0, 10))
g3  = density2d_panel(vitd1_ctgan1, "num_karno",    "fac_prior",    "CTGAN", c(-3, 2), c(0, 10))
g4  = density2d_panel(vitd1,        "num_karno",    "num_age",      "Real",  c(-3, 3), c(-3, 3))
g5  = density2d_panel(vitd1_new1,   "num_karno",    "num_age",      "Stat",  c(-3, 3), c(-3, 3))
g6  = density2d_panel(vitd1_ctgan1, "num_karno",    "num_age",      "CTGAN", c(-3, 3), c(-3, 3))
g7  = density2d_panel(vitd1,        "fac_celltype", "fac_prior",    "Real",  c(1, 4),  c(0, 10))
g8  = density2d_panel(vitd1_new1,   "fac_celltype", "fac_prior",    "Stat",  c(1, 4),  c(0, 10))
g9  = density2d_panel(vitd1_ctgan1, "fac_celltype", "fac_prior",    "CTGAN", c(1, 4),  c(0, 10))
g10 = density2d_panel(vitd1,        "num_diagtime", "num_age",      "Real",  c(0, 6),  c(-3, 3))
g11 = density2d_panel(vitd1_new1,   "num_diagtime", "num_age",      "Stat",  c(0, 6),  c(-3, 3))
g12 = density2d_panel(vitd1_ctgan1, "num_diagtime", "num_age",      "CTGAN", c(0, 6),  c(-3, 3))
g13 = density2d_panel(vitd1,        "num_karno",    "num_diagtime", "Real",  c(-3, 2), c(0, 8))
g14 = density2d_panel(vitd1_new1,   "num_karno",    "num_diagtime", "Stat",  c(-3, 2), c(0, 8))
g15 = density2d_panel(vitd1_ctgan1, "num_karno",    "num_diagtime", "CTGAN", c(-3, 2), c(0, 8))
ff=grid.arrange(g1,g2,g3,g4,g5,g6,g7,g8,g9,g10,g11,g12,g13,g14,g15,ncol=3)
# --- repeated-simulation functions (the vitd_sim definition had been fused
# onto the grid.arrange() call in the original source) ---
# Statistical simulation of the vitamin-D (regression) dataset: modgo draws
# 1000 covariate rows mimicking the real data, then the continuous response
# vitD is generated from a linear model fitted on the real data plus N(0, 1)
# noise.  seed: RNG seed for both modgo and the noise; data: real vitD data.
vitd_sim=function(seed,data){
  vitd1=data
  modgo_dt=as.data.frame(vitd1[,!colnames(vitd1)=="vitD"])
  # Column 13 is the binary covariate; columns 8:12 are categorical — assumes
  # the vitd_3_scaled_all.csv column order, TODO confirm.
  binary_variables <- colnames(modgo_dt)[13]
  categorical_variables <- colnames(modgo_dt)[8:12]
  test <- modgo(data = modgo_dt,bin_variables = binary_variables,categ_variables = categorical_variables,nrep = 1,ties_method= "max", variables= colnames(modgo_dt),
                n_samples= 1000,sigma= c(),
                noise_mu= FALSE, pertr_vec= c(),
                change_cov= c(),change_amount= 0,seed= seed,
                thresh_var= c(), thresh_force = FALSE,
                var_prop= c(),var_infl=c(),infl_cov_stable=FALSE)
  modgo_new1=test[[1]][[1]]
  # generate output var from a linear model fitted on the real data
  formula_str <- paste(colnames(modgo_dt), collapse = " + ")
  formula_obj <- as.formula(paste("vitD ~", formula_str))
  lm_model <- lm(formula_obj, data = vitd1)
  beta <- coef(lm_model)[-1] # Exclude intercept
  # Simulate new responses from the fitted model plus unit-variance noise
  set.seed(seed) # Set seed for reproducibility
  vitD <- as.matrix(modgo_new1) %*% as.matrix(beta) + rnorm(1000,sd = 1)
  vitd1_new1=cbind.data.frame(vitD,modgo_new1)
  return(vitd1_new1)
}
# Generate 100 statistical replicates of the vitD dataset in parallel,
# persist them, and report the elapsed wall-clock time.
start_time <- Sys.time()
result1 <- pbmcapply::pbmclapply(
  1:100,
  vitd_sim,
  data = read_csv("vitd_3_scaled_all.csv"),
  mc.cores = 15
)
end_time <- Sys.time()
runtime <- end_time - start_time
runtime
saveRDS(result1, "vitd_stat_100.rds")
# Simulate one statistical (modgo-based) replicate of the heart-disease
# dataset: draw 1000 covariate rows with modgo, then sample the binary
# `target` from a logistic regression fitted on the real data.
#   seed: RNG seed for both the modgo draw and the Bernoulli sampling.
#   data: real data frame containing a `target` column plus covariates.
# Returns a 1000-row data frame with the simulated `target` as last column.
heart_sim <- function(seed, data) {
  heartdt <- data
  covariates <- as.data.frame(heartdt[, !colnames(heartdt) == "target"])
  binary_variables <- c("sex", "fbs", "exang")
  categorical_variables <- c("restecg", "slope", "thal")
  test <- modgo(
    data = covariates, bin_variables = binary_variables,
    categ_variables = categorical_variables, nrep = 1, ties_method = "max",
    variables = colnames(covariates), n_samples = 1000, sigma = c(),
    noise_mu = FALSE, pertr_vec = c(), change_cov = c(), change_amount = 0,
    seed = seed, thresh_var = c(), thresh_force = FALSE, var_prop = c(),
    var_infl = c(), infl_cov_stable = FALSE
  )
  synth_x <- test[[1]][[1]]
  # Logistic model target ~ all covariates, fitted on the real data.
  rhs <- paste(colnames(covariates), collapse = " + ")
  fit <- glm(as.formula(paste("target ~", rhs)), data = heartdt, family = binomial())
  set.seed(seed) # reproducible Bernoulli draws
  probs <- predict(fit, newdata = as.data.frame(synth_x), type = "response")
  target <- rbinom(n = 1000, size = 1, prob = probs)
  cbind.data.frame(synth_x, target)
}
# Generate 100 statistical replicates of the heart dataset in parallel,
# persist them, and report the elapsed wall-clock time.
start_time <- Sys.time()
result1 <- pbmcapply::pbmclapply(
  1:100,
  heart_sim,
  data = read_csv("heart3.csv"),
  mc.cores = 15
)
end_time <- Sys.time()
runtime <- end_time - start_time
runtime
saveRDS(result1, "heart_stat_100.rds")
# Simulate one statistical (modgo-based) replicate of the veteran survival
# dataset: draw 1000 covariate rows with modgo, then let sim_fun_cox()
# (defined elsewhere in this file) generate the time/status outcomes.
#   seed: RNG seed for the modgo draw and the survival simulation.
#   data: real data frame containing `time`, `status`, and covariates.
veteran_sim <- function(seed, data) {
  veteran3 <- data
  covars <- as.data.frame(veteran3[, !colnames(veteran3) %in% c("time", "status")])
  categ_variables <- c("fac_trt", "fac_celltype", "fac_prior")
  test <- modgo(
    data = covars, bin_variables = c(), categ_variables = categ_variables,
    nrep = 1, ties_method = "max", variables = colnames(covars),
    n_samples = 1000, sigma = c(), noise_mu = FALSE, pertr_vec = c(),
    change_cov = c(), change_amount = 0, seed = seed, thresh_var = c(),
    thresh_force = FALSE, var_prop = c(), var_infl = c(),
    infl_cov_stable = FALSE
  )
  synth_x <- test[[1]][[1]]
  sim_fun_cox(dat = veteran3, seed = seed, n = 1000, X = synth_x)
}
# Generate 100 statistical replicates of the veteran dataset in parallel and
# persist them.
start_time <- Sys.time()
result1 <- pbmcapply::pbmclapply(1:100, veteran_sim, data = read_csv("veteran3.csv"), mc.cores = 15)
end_time <- Sys.time()
runtime <- end_time - start_time
runtime
saveRDS(result1, "veteran_stat_100.rds")

# BUG FIX: saveRDS() above and this function definition were fused on a single
# line (`...rds")train_and_test_lm <- ...`), which is an R syntax error; they
# are now separate statements.
#
# Fit a linear model for vitD on a (simulated) training set and evaluate it on
# the real test data.
#   train_data: data frame with `vitD` plus the predictor columns.
#   test_data:  real data frame; predictor names are taken from its columns.
# Returns list(coefficients = fitted coefficients, mse = test-set MSE).
train_and_test_lm <- function(train_data, test_data) {
  # Predictors are test_data columns 1 and 3:14 (column 2 is assumed to be
  # vitD -- TODO confirm against the vitd_3_scaled_all.csv layout).
  formula_str <- paste(colnames(test_data)[c(1, 3:14)], collapse = " + ")
  formula_obj <- as.formula(paste("vitD ~", formula_str))
  lm_model <- lm(formula_obj, data = train_data)
  # Predict on the held-out real data and score with mean squared error.
  test_predictions <- predict(lm_model, newdata = test_data)
  mse <- mean((test_data$vitD - test_predictions)^2)
  list(coefficients = coef(lm_model), mse = mse)
}
# Evaluate the statistical vitD replicates: train on each simulated set,
# test on the real vitd_3_scaled_all.csv data, and summarise results.
list_of_dataframes <- readRDS("vitd_stat_100.rds")
test_data <- read_csv("vitd_3_scaled_all.csv")
# Preallocate and use seq_along() instead of the hard-coded 1:100 so the loop
# follows the actual number of replicates.
results_list <- vector("list", length(list_of_dataframes))
for (i in seq_along(list_of_dataframes)) {
  results_list[[i]] <- train_and_test_lm(list_of_dataframes[[i]], test_data)
}
# Combine the per-replicate results into coefficient and MSE tables.
results_df <- do.call(rbind, lapply(results_list, function(result) result$coefficients))
results_df2 <- do.call(rbind, lapply(results_list, function(result) result$mse))
colnames(results_df2) <- "mse"
df <- cbind.data.frame(results_df, results_df2)

# Column-wise mean and SD summary of a numeric data frame.
calculate_stats <- function(df) {
  means <- colMeans(df, na.rm = TRUE)
  sds <- apply(df, 2, sd, na.rm = TRUE)
  data.frame(Column = names(means), Mean = means, SD = sds)
}

stats_df <- calculate_stats(df)
ll <- list(as.data.frame(results_df), as.data.frame(results_df2), stats_df)
# NOTE(review): write_xlsx() is from the writexl package, which is not loaded
# in this script's library() block -- confirm it is attached elsewhere.
write_xlsx(ll, "vitd_stats_prediction.xlsx")
###############################
##################################
# Fit a logistic model for `target` on a (simulated) training set and evaluate
# it on the real test data.
#   train_data: data frame with `target` plus the predictor columns.
#   test_data:  real data frame; predictor names are taken from its columns.
# Returns list(coefficients, accuracy = balanced accuracy, f1 = F1 score).
train_and_test_glm <- function(train_data, test_data) {
  # Predictors are the first 13 columns of test_data (column 14 assumed to be
  # target -- TODO confirm against the heart3.csv layout).
  formula_str <- paste(colnames(test_data)[1:13], collapse = " + ")
  formula_obj <- as.formula(paste("target ~", formula_str))
  glm_model <- glm(formula_obj, data = train_data, family = binomial())
  test_predictions <- predict(glm_model, newdata = test_data, type = "response")
  # Classify at the median predicted probability (quantile()[3] is the 50%
  # quantile), i.e. the upper half of predictions is labelled 1.
  predicted_classes <- ifelse(test_predictions > quantile(test_predictions)[3], 1, 0)
  actual <- factor(test_data$target, levels = c(0, 1))
  predicted <- factor(predicted_classes, levels = c(0, 1))
  # BUG FIX: caret::confusionMatrix(data, reference) takes predictions first
  # and the truth second; the original call passed them swapped, which distorts
  # class-conditional rates such as balanced accuracy.
  conf_matrix <- confusionMatrix(data = predicted, reference = actual, positive = "1")
  balanced_accuracy <- conf_matrix$byClass["Balanced Accuracy"]
  # Confusion counts taken directly from the vectors, which is unambiguous
  # about orientation (class 1 is the positive class).
  TP <- sum(predicted == 1 & actual == 1) # True Positive
  FP <- sum(predicted == 1 & actual == 0) # False Positive
  FN <- sum(predicted == 0 & actual == 1) # False Negative
  precision <- TP / (TP + FP)
  recall <- TP / (TP + FN)
  F1 <- 2 * precision * recall / (precision + recall)
  list(coefficients = coef(glm_model), accuracy = balanced_accuracy, f1 = F1)
}
# Evaluate the statistical heart replicates: train on each simulated set,
# test on the real heart3.csv data, and summarise results.
list_of_dataframes <- readRDS("heart_stat_100.rds")
test_data <- read_csv("heart3.csv")
# Preallocate and use seq_along() instead of the hard-coded 1:100 so the loop
# follows the actual number of replicates.
results_list <- vector("list", length(list_of_dataframes))
for (i in seq_along(list_of_dataframes)) {
  results_list[[i]] <- train_and_test_glm(list_of_dataframes[[i]], test_data)
}
# Combine the per-replicate results into coefficient / accuracy / F1 tables.
results_df <- do.call(rbind, lapply(results_list, function(result) result$coefficients))
results_df2 <- do.call(rbind, lapply(results_list, function(result) result$accuracy))
colnames(results_df2) <- "accuracy"
results_df3 <- do.call(rbind, lapply(results_list, function(result) result$f1))
colnames(results_df3) <- "f1"
df <- cbind.data.frame(results_df, results_df2, results_df3)

# Column-wise mean and SD summary of a numeric data frame.
calculate_stats <- function(df) {
  means <- colMeans(df, na.rm = TRUE)
  sds <- apply(df, 2, sd, na.rm = TRUE)
  data.frame(Column = names(means), Mean = means, SD = sds)
}

stats_df <- calculate_stats(df)
ll <- list(as.data.frame(results_df), as.data.frame(results_df2), as.data.frame(results_df3), stats_df)
# NOTE(review): write_xlsx() is from the writexl package, which is not loaded
# in this script's library() block -- confirm it is attached elsewhere.
write_xlsx(ll, "heart_stats_prediction.xlsx")
# Fit a Cox proportional-hazards model on a (simulated) training set and
# evaluate it on the real test data.
#   train_data: data frame with `time`, `status`, and the covariate columns.
#   test_data:  real data frame; covariate names come from its columns 3:8.
# Returns list(coefficients, cindex = Harrell's C on the test set,
#              Brier_score = sum of the finite prediction-error values).
train_and_test_cox <- function(train_data, test_data) {
  covariates <- paste(colnames(test_data)[3:8], collapse = " + ")
  surv_formula <- as.formula(paste("Surv(time,status) ~", covariates))
  coxmod <- coxph(surv_formula, x = TRUE, data = train_data)
  # Harrell's C-index on the test set (negated risk: higher risk = shorter time).
  risk_scores <- predict(object = coxmod, newdata = test_data, type = "risk")
  harrell <- Hmisc::rcorr.cens(-risk_scores, with(test_data, survival::Surv(time, status)))
  cindex <- harrell["C Index"]
  # Brier-type prediction error over the test event times.
  lp_train <- predict(coxmod)
  lp_test <- predict(coxmod, newdata = test_data)
  surv_train <- survival::Surv(train_data$time, train_data$status)
  surv_test <- survival::Surv(test_data$time, test_data$status)
  brier <- survAUC::predErr(surv_train, surv_test, lp_train, lp_test,
                            times = test_data$time, type = "brier",
                            int.type = "unweighted")$error
  # Sum only finite values (is.finite() already excludes NA/NaN/Inf).
  brier_sum <- sum(brier[is.finite(brier)])
  list(coefficients = coef(coxmod), cindex = cindex, Brier_score = brier_sum)
}
# Evaluate the statistical veteran replicates: train a Cox model on each
# simulated set, test on the real veteran3.csv data, and summarise results.
list_of_dataframes <- readRDS("veteran_stat_100.rds")
test_data <- read_csv("veteran3.csv")
# Preallocate and use seq_along() instead of the hard-coded 1:100 so the loop
# follows the actual number of replicates.
results_list <- vector("list", length(list_of_dataframes))
for (i in seq_along(list_of_dataframes)) {
  results_list[[i]] <- train_and_test_cox(list_of_dataframes[[i]], test_data)
}
# Combine the per-replicate results into coefficient / C-index / Brier tables.
results_df <- do.call(rbind, lapply(results_list, function(result) result$coefficients))
results_df2 <- do.call(rbind, lapply(results_list, function(result) result$cindex))
colnames(results_df2) <- "cindex"
results_df3 <- do.call(rbind, lapply(results_list, function(result) result$Brier_score))
colnames(results_df3) <- "Brier_score"
df <- cbind.data.frame(results_df, results_df2, results_df3)

# Column-wise mean and SD summary of a numeric data frame.
calculate_stats <- function(df) {
  means <- colMeans(df, na.rm = TRUE)
  sds <- apply(df, 2, sd, na.rm = TRUE)
  data.frame(Column = names(means), Mean = means, SD = sds)
}

stats_df <- calculate_stats(df)
ll <- list(as.data.frame(results_df), as.data.frame(results_df2), as.data.frame(results_df3), stats_df)
# NOTE(review): write_xlsx() is from the writexl package, which is not loaded
# in this script's library() block -- confirm it is attached elsewhere.
write_xlsx(ll, "veteran_stats_prediction.xlsx")
# List all .csv files in the folder and its subfolders
# CTGAN veteran replicates: read every simulated CSV and evaluate each against
# the real data with train_and_test_cox().
# NOTE(review): `csv_files` is only defined by the commented-out list.files()
# call below, so this section errors unless csv_files is set first --
# uncomment it and point the path at the CTGAN output folder.
#csv_files <- list.files(path = "/dskh/nobackup/yunwei/aigan_simulation/veteran/veteran3", pattern = "\\.csv$", recursive = TRUE, full.names = TRUE)
results_list <- list()
for (file in csv_files) {
  # Read each .csv and keep it keyed by its file path.
  results_list[[file]] <- read.csv(file)
}
length(results_list)
list_of_dataframes <- results_list
test_data <- read_csv("veteran3.csv")
# Preallocate and use seq_along() instead of the hard-coded 1:100 so the loop
# matches the number of files actually read.
results_list <- vector("list", length(list_of_dataframes))
for (i in seq_along(list_of_dataframes)) {
  results_list[[i]] <- train_and_test_cox(list_of_dataframes[[i]], test_data)
}
# Combine the per-replicate results into coefficient / C-index / Brier tables.
results_df <- do.call(rbind, lapply(results_list, function(result) result$coefficients))
results_df2 <- do.call(rbind, lapply(results_list, function(result) result$cindex))
colnames(results_df2) <- "cindex"
results_df3 <- do.call(rbind, lapply(results_list, function(result) result$Brier_score))
colnames(results_df3) <- "Brier_score"
df <- cbind.data.frame(results_df, results_df2, results_df3)
# Uses the column-wise mean/SD calculate_stats() defined earlier in the file.
stats_df <- calculate_stats(df)
ll <- list(as.data.frame(results_df), as.data.frame(results_df2), as.data.frame(results_df3), stats_df)
write_xlsx(ll, "veteran_ctgan_prediction.xlsx")
# Variant of calculate_stats that first drops rows whose Brier score is an
# extreme outlier (>= 1000) before computing column-wise means and SDs.
# (Rows with NA Brier scores are also dropped, since which() keeps only TRUE.)
calculate_stats <- function(df) {
  keep <- which(df$Brier_score < 1000)
  subset_df <- as.data.frame(df[keep, ])
  means <- colMeans(subset_df, na.rm = TRUE)
  sds <- apply(subset_df, 2, sd, na.rm = TRUE)
  data.frame(Column = names(means), Mean = means, SD = sds)
}

# Re-summarise with the outlier filter and write a second workbook.
stats_df <- calculate_stats(df)
ll <- list(as.data.frame(results_df), as.data.frame(results_df2), as.data.frame(results_df3), stats_df)
write_xlsx(ll, "veteran_ctgan_prediction2.xlsx")
# CTGAN heart replicates: read every simulated CSV and evaluate each against
# the real data with train_and_test_glm().
# NOTE(review): `csv_files` is only defined by the commented-out list.files()
# call below -- uncomment it and set the path before running this section.
#csv_files <- list.files(path = "/dskh/nobackup/yunwei/aigan_simulation/heart/heart3", pattern = "\\.csv$", recursive = TRUE, full.names = TRUE)
results_list <- list()
for (file in csv_files) {
  # Read each .csv and keep it keyed by its file path.
  results_list[[file]] <- read.csv(file)
}
length(results_list)
list_of_dataframes <- results_list
test_data <- read_csv("heart3.csv")
# Preallocate and use seq_along() instead of the hard-coded 1:100 so the loop
# matches the number of files actually read.
results_list <- vector("list", length(list_of_dataframes))
for (i in seq_along(list_of_dataframes)) {
  results_list[[i]] <- train_and_test_glm(list_of_dataframes[[i]], test_data)
}
# Combine the per-replicate results into coefficient / accuracy / F1 tables.
results_df <- do.call(rbind, lapply(results_list, function(result) result$coefficients))
results_df2 <- do.call(rbind, lapply(results_list, function(result) result$accuracy))
colnames(results_df2) <- "accuracy"
results_df3 <- do.call(rbind, lapply(results_list, function(result) result$f1))
colnames(results_df3) <- "f1"
df <- cbind.data.frame(results_df, results_df2, results_df3)

# Column-wise mean and SD summary of a numeric data frame.
calculate_stats <- function(df) {
  means <- colMeans(df, na.rm = TRUE)
  sds <- apply(df, 2, sd, na.rm = TRUE)
  data.frame(Column = names(means), Mean = means, SD = sds)
}

stats_df <- calculate_stats(df)
ll <- list(as.data.frame(results_df), as.data.frame(results_df2), as.data.frame(results_df3), stats_df)
write_xlsx(ll, "heart_ctgan_prediction.xlsx")
# CTGAN vitD replicates: read every simulated CSV and evaluate each against
# the real data with train_and_test_lm().
# NOTE(review): `csv_files` is only defined by the commented-out list.files()
# call below -- uncomment it and set the path before running this section.
#csv_files <- list.files(path = "/dskh/nobackup/yunwei/aigan_simulation/vitd3", pattern = "\\.csv$", recursive = TRUE, full.names = TRUE)
results_list <- list()
for (file in csv_files) {
  # Read each .csv and keep it keyed by its file path.
  results_list[[file]] <- read.csv(file)
}
length(results_list)
list_of_dataframes <- results_list
test_data <- read_csv("vitd_3_scaled_all.csv")
# Preallocate; seq_along() is the idiomatic (and empty-safe) replacement for
# 1:length(list_of_dataframes).
results_list <- vector("list", length(list_of_dataframes))
for (i in seq_along(list_of_dataframes)) {
  results_list[[i]] <- train_and_test_lm(list_of_dataframes[[i]], test_data)
}
# Combine the per-replicate results into coefficient and MSE tables.
results_df <- do.call(rbind, lapply(results_list, function(result) result$coefficients))
results_df2 <- do.call(rbind, lapply(results_list, function(result) result$mse))
colnames(results_df2) <- "mse"
df <- cbind.data.frame(results_df, results_df2)

# Column-wise mean and SD summary of a numeric data frame.
calculate_stats <- function(df) {
  means <- colMeans(df, na.rm = TRUE)
  sds <- apply(df, 2, sd, na.rm = TRUE)
  data.frame(Column = names(means), Mean = means, SD = sds)
}

stats_df <- calculate_stats(df)
ll <- list(as.data.frame(results_df), as.data.frame(results_df2), stats_df)
write_xlsx(ll, "vitd_ctgan_prediction.xlsx")